You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:24:53 UTC
[6/8] Porting Lucene.Net.Suggest (still not compiling)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SpellChecker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SpellChecker.cs b/src/Lucene.Net.Suggest/Spell/SpellChecker.cs
new file mode 100644
index 0000000..82f9810
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SpellChecker.cs
@@ -0,0 +1,748 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// <para>
+ /// Spell Checker class (Main class) <br/>
+ /// (initially inspired by the David Spencer code).
+ /// </para>
+ ///
+ /// <para>Example Usage:
+ ///
+ /// <pre class="prettyprint">
+ /// SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
+ /// // To index a field of a user index (config is an IndexWriterConfig, fullMerge a bool):
+ /// spellchecker.IndexDictionary(new LuceneDictionary(my_lucene_reader, a_field), config, fullMerge);
+ /// // To index a file containing words:
+ /// spellchecker.IndexDictionary(new PlainTextDictionary(new File("myfile.txt")), config, fullMerge);
+ /// string[] suggestions = spellchecker.SuggestSimilar("misspelt", 5);
+ /// </pre>
+ ///
+ ///
+ /// </para>
+ /// </summary>
+ public class SpellChecker : IDisposable
+ {
+
+ /// <summary>
+ /// The default minimum score to use when no other value has been assigned through <see cref="Accuracy"/>.
+ /// </summary>
+ public const float DEFAULT_ACCURACY = 0.5f;
+
+ /// <summary>
+ /// Field name for each word in the ngram index.
+ /// </summary>
+ public const string F_WORD = "word";
+
+ /// <summary>
+ /// The directory that holds the spell index. It is reassigned only inside SwapSearcher(),
+ /// together with the searcher and while the searcher lock is held.
+ /// </summary>
+ // don't modify the directory directly - see SwapSearcher()
+ // TODO: why is this internal?
+ internal Directory spellIndex;
+ /// <summary>
+ /// Boost given to the query clause that matches the first n-gram of the word (the "start" fields).
+ /// The clause is only added to the query when this value is positive.
+ /// </summary>
+ private float bStart = 2.0f;
+
+ /// <summary>
+ /// Boost given to the query clause that matches the last n-gram of the word (the "end" fields).
+ /// The clause is only added to the query when this value is positive.
+ /// </summary>
+ private float bEnd = 1.0f;
+ // don't use this searcher directly - borrow it through ObtainSearcher() and
+ // replace it through SwapSearcher()
+
+ private IndexSearcher searcher;
+ /*
+ * this locks all modifications to the current searcher.
+ */
+
+ private readonly object searcherLock = new object();
+ /*
+ * this lock synchronizes all possible modifications to the
+ * current index directory. It should not be possible to try modifying
+ * the same index concurrently. Note: Do not acquire the searcher lock
+ * before acquiring this lock!
+ */
+ private readonly object modifyCurrentIndexLock = new object();
+
+ // set to true by Dispose(); checked by EnsureOpen() and SwapSearcher()
+ private volatile bool closed = false;
+ // minimum score for hits generated by the spell checker query; used by the
+ // SuggestSimilar overloads that do not take an explicit accuracy
+
+ private float accuracy = DEFAULT_ACCURACY;
+
+ // similarity measure applied to every candidate word returned by the n-gram query
+ private StringDistance sd;
+ // ordering handed to the SuggestWordQueue that collects the suggestions
+ private IComparer<SuggestWord> comparator;
+
+        /// <summary>
+        /// Creates a spell checker on top of <paramref name="spellIndex"/> that scores candidates with
+        /// <paramref name="sd"/> and orders them with <see cref="SuggestWordQueue.DEFAULT_COMPARATOR"/>.
+        /// An index is created in the directory when it does not contain one yet.
+        /// </summary>
+        /// <param name="spellIndex"> the spell index directory </param>
+        /// <param name="sd"> the <see cref="StringDistance"/> measurement to use </param>
+        /// <exception cref="IOException"> if the spell checker cannot open the directory </exception>
+        public SpellChecker(Directory spellIndex, StringDistance sd)
+            : this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR)
+        {
+            // all initialization happens in the three-argument constructor
+        }
+        /// <summary>
+        /// Creates a spell checker on top of <paramref name="spellIndex"/> that uses a new
+        /// <see cref="LevensteinDistance"/> as its <see cref="StringDistance"/>.
+        /// An index is created in the directory when it does not contain one yet.
+        /// </summary>
+        /// <param name="spellIndex"> the spell index directory </param>
+        /// <exception cref="IOException"> if the spell checker cannot open the directory </exception>
+        public SpellChecker(Directory spellIndex)
+            : this(spellIndex, new LevensteinDistance())
+        {
+            // all initialization happens in the constructor this one delegates to
+        }
+
+        /// <summary>
+        /// Creates a spell checker on top of <paramref name="spellIndex"/> with an explicit
+        /// <see cref="Lucene.Net.Search.Spell.StringDistance"/> measure and an explicit
+        /// <see cref="IComparer{T}"/> for sorting the results.
+        /// </summary>
+        /// <param name="spellIndex"> The spelling index </param>
+        /// <param name="sd"> The distance </param>
+        /// <param name="comparator"> The comparator </param>
+        /// <exception cref="IOException"> if there is a problem opening the index </exception>
+        public SpellChecker(Directory spellIndex, StringDistance sd, IComparer<SuggestWord> comparator)
+        {
+            // NOTE: both property setters below are virtual, so an overriding subclass
+            // sees these calls before its own constructor body has run.
+            this.SpellIndex = spellIndex;   // opens (and if necessary creates) the index
+            this.StringDistance = sd;
+            this.comparator = comparator;
+        }
+
+        /// <summary>
+        /// Switches the spell checker to another index directory, or re-opens the current
+        /// one when the assigned value is the directory already in use.
+        /// When the directory holds no index, an <see cref="IndexWriter"/> is opened and
+        /// disposed on it before the searcher is opened.
+        /// </summary>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <exception cref="System.IO.IOException"> if spellchecker can not open the directory </exception>
+        // TODO: this should not be virtual, as it is called in the constructor
+        public virtual Directory SpellIndex
+        {
+            set
+            {
+                // the new value may be the directory that is already in use, so every
+                // modification of it goes through the index-modification lock
+                lock (modifyCurrentIndexLock)
+                {
+                    EnsureOpen();
+
+                    bool indexPresent = DirectoryReader.IndexExists(value);
+                    if (!indexPresent)
+                    {
+                        var bootstrapConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, null);
+                        using (var bootstrapWriter = new IndexWriter(value, bootstrapConfig))
+                        {
+                            // nothing to write: the writer is opened and disposed only
+                        }
+                    }
+
+                    SwapSearcher(value);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets the <see cref="IComparer{T}"/> that is handed to the
+        /// <see cref="SuggestWordQueue"/> used for collecting suggestions.
+        /// </summary>
+        public virtual IComparer<SuggestWord> Comparator
+        {
+            get
+            {
+                return this.comparator;
+            }
+            set
+            {
+                comparator = value;
+            }
+        }
+
+
+        /// <summary>
+        /// Gets or sets the <see cref="Lucene.Net.Search.Spell.StringDistance"/> implementation
+        /// that this <see cref="SpellChecker"/> instance uses to score candidate words.
+        /// </summary>
+        public virtual StringDistance StringDistance
+        {
+            get
+            {
+                return this.sd;
+            }
+            set
+            {
+                sd = value;
+            }
+        }
+
+        /// <summary>
+        /// Gets or sets the minimum score a suggestion must reach (0 &lt; minScore &lt; 1);
+        /// the initial value is <see cref="DEFAULT_ACCURACY"/>. The value is stored as given,
+        /// no range check is performed.
+        /// </summary>
+        public virtual float Accuracy
+        {
+            get
+            {
+                return this.accuracy;
+            }
+            set
+            {
+                accuracy = value;
+            }
+        }
+
+
+        /// <summary>
+        /// Suggests words similar to <paramref name="word"/>, using no user index and the
+        /// accuracy currently configured on this instance.
+        ///
+        /// <para>The n-gram query that fetches candidates ranks them differently from the
+        /// string distance that finally scores them, so the best match is not necessarily
+        /// among the first few candidates. With numSug == 1 the single suggestion may
+        /// therefore not be the best one; ask for <b>at least</b> 5 to get a good suggestion.
+        /// </para>
+        /// </summary>
+        /// <param name="word"> the word you want a spell check done on </param>
+        /// <param name="numSug"> the number of suggested words </param>
+        /// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <returns> the suggested words </returns>
+        /// <seealso cref="SuggestSimilar(string, int, IndexReader, string, SuggestMode, float)"/>
+        public virtual string[] SuggestSimilar(string word, int numSug)
+        {
+            // no reader and no field: the six-argument overload treats this as SUGGEST_ALWAYS
+            string[] suggestions = this.SuggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
+            return suggestions;
+        }
+
+        /// <summary>
+        /// Suggests words similar to <paramref name="word"/>, using no user index and an
+        /// explicit minimum score.
+        ///
+        /// <para>The n-gram query that fetches candidates ranks them differently from the
+        /// string distance that finally scores them, so the best match is not necessarily
+        /// among the first few candidates. With numSug == 1 the single suggestion may
+        /// therefore not be the best one; ask for <b>at least</b> 5 to get a good suggestion.
+        /// </para>
+        /// </summary>
+        /// <param name="word"> the word you want a spell check done on </param>
+        /// <param name="numSug"> the number of suggested words </param>
+        /// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
+        /// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <returns> the suggested words </returns>
+        /// <seealso cref="SuggestSimilar(string, int, IndexReader, string, SuggestMode, float)"/>
+        public virtual string[] SuggestSimilar(string word, int numSug, float accuracy)
+        {
+            // no reader and no field: the six-argument overload treats this as SUGGEST_ALWAYS
+            string[] suggestions = this.SuggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
+            return suggestions;
+        }
+
+        /// <summary>
+        /// Calls <see cref="SuggestSimilar(string, int, IndexReader, string, SuggestMode, float)"/>
+        /// as SuggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy), i.e. with the
+        /// accuracy currently configured on this instance.
+        /// </summary>
+        public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode)
+        {
+            float configuredAccuracy = this.accuracy;
+            return SuggestSimilar(word, numSug, ir, field, suggestMode, configuredAccuracy);
+        }
+
+        /// <summary>
+        /// Suggests words similar to <paramref name="word"/>, optionally restricted to the
+        /// words present in one field of a user index.
+        ///
+        /// <para>Candidates are fetched from the spell index with an n-gram query (at most
+        /// 10 * numSug hits) and are then scored with the configured string distance. Because
+        /// the two rankings differ, the best match is not necessarily among the first few
+        /// candidates: with numSug == 1 the single suggestion may not be the best one, so ask
+        /// for <b>at least</b> 5 to get a good suggestion.
+        /// </para>
+        /// </summary>
+        /// <param name="word"> the word you want a spell check done on </param>
+        /// <param name="numSug"> the number of suggested words </param>
+        /// <param name="ir"> the indexReader of the user index (can be null see field param) </param>
+        /// <param name="field"> the field of the user index: if field is not null, the suggested
+        /// words are restricted to the words present in this field. </param>
+        /// <param name="suggestMode">
+        /// (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS) </param>
+        /// <param name="accuracy"> The minimum score a suggestion must have in order to qualify for inclusion in the results </param>
+        /// <exception cref="IOException"> if the underlying index throws an <see cref="IOException"/> </exception>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <returns> the suggested words, ordered by the configured comparator; when the mode is
+        /// SUGGEST_WHEN_NOT_IN_INDEX and the word occurs in the user index, an array holding
+        /// only the word itself </returns>
+        public virtual string[] SuggestSimilar(string word, int numSug, IndexReader ir, string field, SuggestMode suggestMode, float accuracy)
+        {
+            // ObtainSearcher also verifies that this instance has not been closed
+            IndexSearcher spellSearcher = ObtainSearcher();
+            try
+            {
+                // without both a reader and a field there is no user index to consult
+                if (ir == null || field == null)
+                {
+                    suggestMode = SuggestMode.SUGGEST_ALWAYS;
+                }
+                if (suggestMode == SuggestMode.SUGGEST_ALWAYS)
+                {
+                    ir = null;
+                    field = null;
+                }
+                bool useUserIndex = ir != null && field != null;
+
+                int wordLength = word.Length;
+
+                int wordFreq = useUserIndex ? ir.DocFreq(new Term(field, word)) : 0;
+                int requiredFreq = (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) ? wordFreq : 0;
+
+                // the word is in the user index and frequency does not matter: return it as is
+                if (wordFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
+                {
+                    return new string[] { word };
+                }
+
+                // one optional clause per n-gram, plus boosted clauses for the first and last gram
+                BooleanQuery ngramQuery = new BooleanQuery();
+                int largestGram = GetMax(wordLength);
+                for (int gramSize = GetMin(wordLength); gramSize <= largestGram; gramSize++)
+                {
+                    string[] grams = FormGrams(word, gramSize); // duplicates are kept
+                    if (grams.Length == 0)
+                    {
+                        continue;
+                    }
+
+                    if (bStart > 0)
+                    {
+                        // matches start of word
+                        Add(ngramQuery, "start" + gramSize, grams[0], bStart);
+                    }
+                    if (bEnd > 0)
+                    {
+                        // matches end of word
+                        Add(ngramQuery, "end" + gramSize, grams[grams.Length - 1], bEnd);
+                    }
+
+                    string gramField = "gram" + gramSize;
+                    foreach (string gram in grams)
+                    {
+                        Add(ngramQuery, gramField, gram);
+                    }
+                }
+
+                // fetch more hits than requested because the distance filter drops some of them
+                int maxHits = 10 * numSug;
+                ScoreDoc[] hits = spellSearcher.Search(ngramQuery, null, maxHits).ScoreDocs;
+                SuggestWordQueue best = new SuggestWordQueue(numSug, comparator);
+
+                int limit = Math.Min(hits.Length, maxHits);
+                // the same instance is refilled until it is actually queued
+                SuggestWord candidate = new SuggestWord();
+                for (int hit = 0; hit < limit; hit++)
+                {
+                    candidate.@string = spellSearcher.Doc(hits[hit].Doc).Get(F_WORD); // the indexed word
+
+                    // never suggest the word for itself
+                    if (candidate.@string.Equals(word))
+                    {
+                        continue;
+                    }
+
+                    candidate.score = sd.GetDistance(word, candidate.@string);
+                    if (candidate.score < accuracy)
+                    {
+                        continue;
+                    }
+
+                    if (useUserIndex)
+                    {
+                        candidate.freq = ir.DocFreq(new Term(field, candidate.@string)); // freq in the user index
+                        bool lessPopular = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && requiredFreq > candidate.freq;
+                        // also skip words that do not occur in the field at all
+                        if (lessPopular || candidate.freq < 1)
+                        {
+                            continue;
+                        }
+                    }
+
+                    best.InsertWithOverflow(candidate);
+                    if (best.Size() == numSug)
+                    {
+                        // queue is full: from now on a candidate must reach the score of the queue top
+                        accuracy = best.Top().score;
+                    }
+                    candidate = new SuggestWord();
+                }
+
+                // fill the result back to front while popping the queue
+                string[] suggestions = new string[best.Size()];
+                for (int slot = suggestions.Length - 1; slot >= 0; slot--)
+                {
+                    suggestions[slot] = best.Pop().@string;
+                }
+
+                return suggestions;
+            }
+            finally
+            {
+                ReleaseSearcher(spellSearcher);
+            }
+        }
+        /// <summary>
+        /// Appends an optional (SHOULD) term clause for <paramref name="name"/>:<paramref name="value"/>
+        /// with the given boost to a boolean query.
+        /// </summary>
+        private static void Add(BooleanQuery q, string name, string value, float boost)
+        {
+            Query boostedTerm = new TermQuery(new Term(name, value));
+            boostedTerm.Boost = boost;
+            BooleanClause optionalClause = new BooleanClause(boostedTerm, BooleanClause.Occur.SHOULD);
+            q.Add(optionalClause);
+        }
+
+        /// <summary>
+        /// Appends an optional (SHOULD) term clause for <paramref name="name"/>:<paramref name="value"/>
+        /// to a boolean query, without setting a boost.
+        /// </summary>
+        private static void Add(BooleanQuery q, string name, string value)
+        {
+            TermQuery termQuery = new TermQuery(new Term(name, value));
+            BooleanClause optionalClause = new BooleanClause(termQuery, BooleanClause.Occur.SHOULD);
+            q.Add(optionalClause);
+        }
+
+        /// <summary>
+        /// Form all ngrams for a given word. </summary>
+        /// <param name="text"> the word to parse </param>
+        /// <param name="ng"> the ngram length e.g. 3 </param>
+        /// <returns> an array of all ngrams in the word, in order of their position; duplicates
+        /// are not removed. The array is empty when the word is shorter than
+        /// <paramref name="ng"/>. </returns>
+        private static string[] FormGrams(string text, int ng)
+        {
+            int count = text.Length - ng + 1;
+            if (count <= 0)
+            {
+                // The word is too short for this gram size. Without this guard a negative
+                // array length was requested (e.g. empty word, ng == 2), which throws.
+                return new string[0];
+            }
+
+            string[] res = new string[count];
+            for (int i = 0; i < count; i++)
+            {
+                res[i] = text.Substring(i, ng);
+            }
+            return res;
+        }
+
+        /// <summary>
+        /// Removes all terms from the spell check index by re-opening it with a writer in
+        /// CREATE mode, then re-opens the searcher on the same directory. </summary>
+        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        public virtual void ClearIndex()
+        {
+            lock (modifyCurrentIndexLock)
+            {
+                EnsureOpen();
+
+                Directory currentIndex = this.spellIndex;
+                var recreateConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, null)
+                    .SetOpenMode(IndexWriterConfig.OpenMode.CREATE);
+                using (var recreatingWriter = new IndexWriter(currentIndex, recreateConfig))
+                {
+                    // nothing to write: the writer is opened and disposed only
+                }
+
+                SwapSearcher(currentIndex);
+            }
+        }
+
+        /// <summary>
+        /// Check whether the word exists in the spell index, i.e. whether at least one
+        /// document carries it in the <see cref="F_WORD"/> field. </summary>
+        /// <param name="word"> word to check </param>
+        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <returns> true if the word exists in the index </returns>
+        public virtual bool Exist(string word)
+        {
+            // ObtainSearcher also verifies that this instance has not been closed
+            IndexSearcher spellSearcher = ObtainSearcher();
+            try
+            {
+                // TODO: we should use ReaderUtil+seekExact, the docFreq itself is of no
+                // interest - this is just an existence check
+                IndexReader spellReader = spellSearcher.IndexReader;
+                int docFreq = spellReader.DocFreq(new Term(F_WORD, word));
+                return docFreq > 0;
+            }
+            finally
+            {
+                ReleaseSearcher(spellSearcher);
+            }
+        }
+
+        /// <summary>
+        /// Indexes the data from the given <see cref="Dictionary"/>. Words shorter than three
+        /// characters are skipped, as are words that are already present in the
+        /// <see cref="F_WORD"/> field of the spell index. Afterwards the searcher is re-opened
+        /// so that later suggestions see the added words. </summary>
+        /// <param name="dict"> Dictionary to index </param>
+        /// <param name="config"> <see cref="IndexWriterConfig"/> to use </param>
+        /// <param name="fullMerge"> whether or not the spellcheck index should be fully merged </param>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+        public void IndexDictionary(Dictionary dict, IndexWriterConfig config, bool fullMerge)
+        {
+            lock (modifyCurrentIndexLock)
+            {
+                EnsureOpen();
+                Directory dir = this.spellIndex;
+                using (var writer = new IndexWriter(dir, config))
+                {
+                    IndexSearcher indexSearcher = ObtainSearcher();
+                    try
+                    {
+                        // One TermsEnum per index segment that has the word field; they are
+                        // used to detect words that are already indexed.
+                        IList<TermsEnum> termsEnums = new List<TermsEnum>();
+
+                        // Read through the searcher obtained above (its reader reference is
+                        // held until ReleaseSearcher), not through the shared field, which
+                        // Dispose() may set to null concurrently.
+                        IndexReader reader = indexSearcher.IndexReader;
+                        if (reader.MaxDoc() > 0)
+                        {
+                            foreach (AtomicReaderContext ctx in reader.Leaves())
+                            {
+                                Terms terms = ctx.Reader().Terms(F_WORD);
+                                if (terms != null)
+                                {
+                                    termsEnums.Add(terms.Iterator(null));
+                                }
+                            }
+                        }
+
+                        BytesRefIterator iter = dict.EntryIterator;
+                        BytesRef currentTerm;
+
+                        while ((currentTerm = iter.Next()) != null)
+                        {
+                            string word = currentTerm.Utf8ToString();
+                            int len = word.Length;
+                            if (len < 3)
+                            {
+                                continue; // too short we bail but "too long" is fine...
+                            }
+
+                            bool alreadyIndexed = false;
+                            foreach (TermsEnum te in termsEnums)
+                            {
+                                if (te.SeekExact(currentTerm))
+                                {
+                                    alreadyIndexed = true;
+                                    break;
+                                }
+                            }
+                            if (alreadyIndexed)
+                            {
+                                continue;
+                            }
+
+                            // ok index the word
+                            var doc = CreateDocument(word, GetMin(len), GetMax(len));
+                            writer.AddDocument(doc);
+                        }
+                    }
+                    finally
+                    {
+                        // also reached when enumerating the existing terms fails
+                        ReleaseSearcher(indexSearcher);
+                    }
+
+                    if (fullMerge)
+                    {
+                        writer.ForceMerge(1);
+                    }
+                }
+                // TODO: this isn't that great, maybe in the future SpellChecker should take
+                // IWC in its ctor / keep its writer open?
+
+                // also re-open the spell index to see our own changes when the next suggestion
+                // is fetched:
+                SwapSearcher(dir);
+            }
+        }
+
+        /// <summary>
+        /// Smallest n-gram size used for a word of length <paramref name="l"/>:
+        /// 3 for words longer than five characters, 2 for exactly five, 1 otherwise.
+        /// </summary>
+        private static int GetMin(int l)
+        {
+            return l > 5 ? 3 : (l == 5 ? 2 : 1);
+        }
+
+        /// <summary>
+        /// Largest n-gram size used for a word of length <paramref name="l"/>:
+        /// 4 for words longer than five characters, 3 for exactly five, 2 otherwise.
+        /// </summary>
+        private static int GetMax(int l)
+        {
+            return l > 5 ? 4 : (l == 5 ? 3 : 2);
+        }
+
+        /// <summary>
+        /// Builds the index document for one word: the word itself in the stored
+        /// <see cref="F_WORD"/> field, followed by its n-gram fields for every gram size
+        /// from <paramref name="ng1"/> to <paramref name="ng2"/> (inclusive).
+        /// </summary>
+        private static Document CreateDocument(string text, int ng1, int ng2)
+        {
+            // The word field is never queried on; it is indexed so that it can be checked
+            // quickly on rebuild, and stored so that it can be retrieved for suggestions.
+            // It doesn't need norms or TF/pos.
+            Field wordField = new StringField(F_WORD, text, Field.Store.YES);
+
+            var wordDocument = new Document();
+            wordDocument.Add(wordField); // orig term
+            AddGram(text, wordDocument, ng1, ng2);
+            return wordDocument;
+        }
+
+        /// <summary>
+        /// Adds the n-gram fields of <paramref name="text"/> to <paramref name="doc"/> for every
+        /// gram size from <paramref name="ng1"/> to <paramref name="ng2"/> (inclusive): one
+        /// "gramN" field per n-gram, a "startN" field for the first gram and an "endN" field
+        /// for the last one. Gram sizes longer than the text add nothing.
+        /// </summary>
+        private static void AddGram(string text, Document doc, int ng1, int ng2)
+        {
+            int textLength = text.Length;
+            for (int size = ng1; size <= ng2; size++)
+            {
+                string gramFieldName = "gram" + size;
+                int gramCount = textLength - size + 1;
+                string lastGram = null;
+
+                for (int offset = 0; offset < gramCount; offset++)
+                {
+                    string gram = text.Substring(offset, size);
+
+                    // spellchecker does not use positional queries, but we want freqs
+                    // for scoring these multivalued n-gram fields.
+                    FieldType gramType = new FieldType(StringField.TYPE_NOT_STORED)
+                    {
+                        IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS
+                    };
+                    Field gramField = new Field(gramFieldName, gram, gramType);
+                    doc.Add(gramField);
+
+                    if (offset == 0)
+                    {
+                        // only one term possible in the startXXField, TF/pos and norms aren't needed.
+                        Field firstGramField = new StringField("start" + size, gram, Field.Store.NO);
+                        doc.Add(firstGramField);
+                    }
+
+                    lastGram = gram;
+                }
+
+                // stays null when the text is shorter than the gram size
+                if (lastGram != null)
+                {
+                    // only one term possible in the endXXField, TF/pos and norms aren't needed.
+                    Field lastGramField = new StringField("end" + size, lastGram, Field.Store.NO);
+                    doc.Add(lastGramField);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns the current searcher after incrementing the reference count of its reader.
+        /// Every call must be paired with a call to ReleaseSearcher.
+        /// </summary>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        private IndexSearcher ObtainSearcher()
+        {
+            lock (searcherLock)
+            {
+                EnsureOpen();
+                IndexSearcher current = searcher;
+                current.IndexReader.IncRef();
+                return current;
+            }
+        }
+
+        /// <summary>
+        /// Gives back a searcher received from ObtainSearcher by decrementing the
+        /// reference count of its reader.
+        /// </summary>
+        private void ReleaseSearcher(IndexSearcher aSearcher)
+        {
+            // Always decrement, even when this instance has been closed in the meantime,
+            // and always on the searcher that was handed out: the private searcher field
+            // could have been swapped since.
+            IndexReader borrowedReader = aSearcher.IndexReader;
+            borrowedReader.DecRef();
+        }
+
+        /// <summary>
+        /// Throws when this spell checker has been closed; does nothing otherwise.
+        /// </summary>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker is already closed </exception>
+        private void EnsureOpen()
+        {
+            if (!closed)
+            {
+                return;
+            }
+            throw new AlreadyClosedException("Spellchecker has been closed");
+        }
+
+        /// <summary>
+        /// Closes the IndexSearcher used by this SpellChecker and marks the instance as closed.
+        /// Calling this method again on an instance that is already closed does nothing, as
+        /// the <see cref="IDisposable"/> contract requires. </summary>
+        /// <exception cref="IOException"> if the close operation causes an <see cref="IOException"/> </exception>
+        public void Dispose()
+        {
+            lock (searcherLock)
+            {
+                if (closed)
+                {
+                    // already disposed: repeated calls must be ignored rather than throw
+                    return;
+                }
+
+                closed = true;
+                if (searcher != null)
+                {
+                    searcher.IndexReader.Dispose();
+                }
+                searcher = null;
+            }
+        }
+
+        /// <summary>
+        /// Opens a searcher on <paramref name="dir"/> and makes it, together with the
+        /// directory, the current one; the reader of the previous searcher is disposed.
+        /// </summary>
+        /// <exception cref="AlreadyClosedException"> if the Spellchecker was closed in the
+        /// meantime; the freshly opened reader is disposed again in that case </exception>
+        private void SwapSearcher(Directory dir)
+        {
+            /*
+             * Opening a searcher is possibly very expensive, so it is done before the
+             * searcher lock is taken. We rather close the new searcher again if the
+             * Spellchecker was closed during this operation than block access to the
+             * current searcher while opening.
+             */
+            IndexSearcher replacement = CreateSearcher(dir);
+            lock (searcherLock)
+            {
+                if (closed)
+                {
+                    replacement.IndexReader.Dispose();
+                    throw new AlreadyClosedException("Spellchecker has been closed");
+                }
+
+                IndexSearcher previous = searcher;
+                if (previous != null)
+                {
+                    previous.IndexReader.Dispose();
+                }
+
+                // searcher and directory are assigned inside the same lock so that they
+                // always belong together
+                searcher = replacement;
+                this.spellIndex = dir;
+            }
+        }
+
+        /// <summary>
+        /// Creates a new IndexSearcher on a reader opened with <c>DirectoryReader.Open</c>. </summary>
+        /// <param name="dir"> the directory used to open the searcher </param>
+        /// <returns> a new IndexSearcher over <paramref name="dir"/> </returns>
+        /// <exception cref="IOException"> if there is a low-level IO error </exception>
+        // internal and virtual for testing purposes
+        internal virtual IndexSearcher CreateSearcher(Directory dir)
+        {
+            var directoryReader = DirectoryReader.Open(dir);
+            return new IndexSearcher(directoryReader);
+        }
+
+        /// <summary>
+        /// Tells whether this <see cref="SpellChecker"/> has been closed.
+        /// </summary>
+        /// <returns> <c>true</c> if and only if the <see cref="SpellChecker"/> is
+        /// closed, otherwise <c>false</c>. </returns>
+        internal virtual bool Closed
+        {
+            get { return this.closed; }
+        }
+
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/StringDistance.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/StringDistance.cs b/src/Lucene.Net.Suggest/Spell/StringDistance.cs
new file mode 100644
index 0000000..d50a9b4
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/StringDistance.cs
@@ -0,0 +1,36 @@
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Interface for string distances.
+ /// <para>
+ /// Note that the value returned by <see cref="GetDistance(string, string)"/> grows with
+ /// similarity: 1 stands for identical strings and 0 for maximally different ones.
+ /// </para>
+ /// </summary>
+ public interface StringDistance
+ {
+
+ /// <summary>
+ /// Returns a float between 0 and 1 based on how similar the specified strings are to one another.
+ /// Returning a value of 1 means the specified strings are identical and 0 means the
+ /// strings are maximally different. </summary>
+ /// <param name="s1"> The first string. </param>
+ /// <param name="s2"> The second string. </param>
+ /// <returns> a float between 0 and 1 based on how similar the specified strings are to one another. </returns>
+ float GetDistance(string s1, string s2);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SuggestMode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SuggestMode.cs b/src/Lucene.Net.Suggest/Spell/SuggestMode.cs
new file mode 100644
index 0000000..f277323
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SuggestMode.cs
@@ -0,0 +1,46 @@
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Set of strategies for suggesting related terms
+ /// @lucene.experimental
+ /// </summary>
+ public enum SuggestMode
+ {
+ /// <summary>
+ /// Generate suggestions only for terms not in the index (default).
+ /// SpellChecker.SuggestSimilar returns just the word itself when the word is found
+ /// in the user index under this mode.
+ /// </summary>
+ SUGGEST_WHEN_NOT_IN_INDEX,
+
+ /// <summary>
+ /// Return only suggested words that are as frequent or more frequent than the
+ /// searched word
+ /// </summary>
+ SUGGEST_MORE_POPULAR,
+
+ /// <summary>
+ /// Always attempt to offer suggestions (however, other parameters may limit
+ /// suggestions. For example, see the maximum query frequency setting of
+ /// DirectSpellChecker).
+ /// SpellChecker.SuggestSimilar switches to this mode on its own when it is given
+ /// no index reader or no field.
+ /// </summary>
+ SUGGEST_ALWAYS
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SuggestWord.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SuggestWord.cs b/src/Lucene.Net.Suggest/Spell/SuggestWord.cs
new file mode 100644
index 0000000..35aea47
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SuggestWord.cs
@@ -0,0 +1,53 @@
+namespace Lucene.Net.Search.Spell
+{
+
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// A single suggestion candidate, as filled in by the SuggestSimilar method of the
+    /// SpellChecker class: the word, its score and its frequency.
+    /// <p/>
+    /// Default sort is first by score, then by frequency.
+    /// </summary>
+    public sealed class SuggestWord
+    {
+        /// <summary>
+        /// the suggested word
+        /// </summary>
+        public string @string;
+
+        /// <summary>
+        /// the score of the word
+        /// </summary>
+        public float score;
+
+        /// <summary>
+        /// The freq of the word
+        /// </summary>
+        public int freq;
+
+        /// <summary>
+        /// Creates a new empty suggestion: the text is null, score and freq are zero.
+        /// </summary>
+        public SuggestWord()
+        {
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SuggestWordFrequencyComparator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SuggestWordFrequencyComparator.cs b/src/Lucene.Net.Suggest/Spell/SuggestWordFrequencyComparator.cs
new file mode 100644
index 0000000..3e7abd1
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SuggestWordFrequencyComparator.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+    /// <summary>
+    /// Frequency first, then score.
+    /// </summary>
+    public class SuggestWordFrequencyComparator : IComparer<SuggestWord>
+    {
+
+        /// <summary>
+        /// Creates a new comparator that will compare by <see cref="SuggestWord.freq"/>,
+        /// then by <see cref="SuggestWord.score"/>, then by <see cref="SuggestWord.@string"/>.
+        /// </summary>
+        public SuggestWordFrequencyComparator()
+        {
+        }
+
+        /// <summary>
+        /// Compares two suggestions by frequency, then by score, then by their text.
+        /// </summary>
+        /// <param name="first"> the first suggestion </param>
+        /// <param name="second"> the second suggestion </param>
+        /// <returns> a positive value when <paramref name="first"/> has the higher frequency
+        /// (or, on equal frequency, the higher score), a negative value in the opposite case.
+        /// When both are equal, the result of an ordinal comparison of the text of
+        /// <paramref name="second"/> against the text of <paramref name="first"/>. </returns>
+        public virtual int Compare(SuggestWord first, SuggestWord second)
+        {
+            // first criteria: the frequency
+            if (first.freq > second.freq)
+            {
+                return 1;
+            }
+            if (first.freq < second.freq)
+            {
+                return -1;
+            }
+
+            // second criteria (if first criteria is equal): the score
+            if (first.score > second.score)
+            {
+                return 1;
+            }
+            if (first.score < second.score)
+            {
+                return -1;
+            }
+
+            // third criteria: term text. The comparison is ordinal so that the order does
+            // not depend on the current culture (string.CompareTo is culture-sensitive);
+            // operands are deliberately swapped, as before.
+            return string.CompareOrdinal(second.@string, first.@string);
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SuggestWordQueue.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SuggestWordQueue.cs b/src/Lucene.Net.Suggest/Spell/SuggestWordQueue.cs
new file mode 100644
index 0000000..d46a524
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SuggestWordQueue.cs
@@ -0,0 +1,65 @@
+using System.Collections.Generic;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Spell
+{
+
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+    /// <summary>
+    /// A priority queue of <see cref="SuggestWord"/> instances whose ordering is
+    /// delegated to an <see cref="IComparer{T}"/>.
+    /// </summary>
+    /// <seealso cref="SuggestWordScoreComparator"/>
+    /// <seealso cref="SuggestWordFrequencyComparator"/>
+    public sealed class SuggestWordQueue : PriorityQueue<SuggestWord>
+    {
+        /// <summary>
+        /// Comparator used when none is supplied: a <see cref="SuggestWordScoreComparator"/>.
+        /// </summary>
+        public static readonly IComparer<SuggestWord> DEFAULT_COMPARATOR = new SuggestWordScoreComparator();
+
+        // Decides the relative order of two queued words.
+        private readonly IComparer<SuggestWord> comparator;
+
+        /// <summary>
+        /// Creates a queue that orders its elements with <see cref="DEFAULT_COMPARATOR"/>.
+        /// </summary>
+        /// <param name="size"> The size of the queue </param>
+        public SuggestWordQueue(int size)
+            : this(size, DEFAULT_COMPARATOR)
+        {
+        }
+
+        /// <summary>
+        /// Creates a queue of the given size that orders its elements with the given comparator.
+        /// </summary>
+        /// <param name="size"> The size </param>
+        /// <param name="comparator"> The comparator. </param>
+        public SuggestWordQueue(int size, IComparer<SuggestWord> comparator)
+            : base(size)
+        {
+            this.comparator = comparator;
+        }
+
+        /// <summary>
+        /// Returns <c>true</c> when the comparator ranks <paramref name="wa"/> strictly
+        /// below <paramref name="wb"/>.
+        /// </summary>
+        public override bool LessThan(SuggestWord wa, SuggestWord wb)
+        {
+            return comparator.Compare(wa, wb) < 0;
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/SuggestWordScoreComparator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/SuggestWordScoreComparator.cs b/src/Lucene.Net.Suggest/Spell/SuggestWordScoreComparator.cs
new file mode 100644
index 0000000..d626d91
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/SuggestWordScoreComparator.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+
+namespace Lucene.Net.Search.Spell
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+    /// <summary>
+    /// Orders <see cref="SuggestWord"/> instances by score first, then by frequency,
+    /// and finally by term text.
+    /// </summary>
+    public class SuggestWordScoreComparator : IComparer<SuggestWord>
+    {
+
+        /// <summary>
+        /// Creates a new comparator that will compare by <see cref="SuggestWord.score"/>,
+        /// then by <see cref="SuggestWord.freq"/>, then by <see cref="SuggestWord.@string"/>.
+        /// </summary>
+        public SuggestWordScoreComparator()
+        {
+        }
+
+        /// <summary>
+        /// Compares two suggestions.
+        /// </summary>
+        /// <param name="first"> The left-hand suggestion. </param>
+        /// <param name="second"> The right-hand suggestion. </param>
+        /// <returns>
+        /// A positive value when <paramref name="first"/> ranks above <paramref name="second"/>,
+        /// a negative value when it ranks below, and zero when all three criteria are equal.
+        /// </returns>
+        public virtual int Compare(SuggestWord first, SuggestWord second)
+        {
+            // first criteria: the distance
+            if (first.score > second.score)
+            {
+                return 1;
+            }
+            if (first.score < second.score)
+            {
+                return -1;
+            }
+
+            // second criteria (if first criteria is equal): the popularity
+            if (first.freq > second.freq)
+            {
+                return 1;
+            }
+            if (first.freq < second.freq)
+            {
+                return -1;
+            }
+
+            // third criteria: term text. The operands are deliberately reversed.
+            // An ordinal comparison is used so the ordering does not depend on the
+            // current culture (string.CompareTo is culture-sensitive, whereas the
+            // Java String.compareTo this was ported from compares code units).
+            return string.CompareOrdinal(second.@string, first.@string);
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/TermFreqIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/TermFreqIterator.cs b/src/Lucene.Net.Suggest/Spell/TermFreqIterator.cs
new file mode 100644
index 0000000..5414173
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/TermFreqIterator.cs
@@ -0,0 +1,68 @@
+using System.Collections.Generic;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Interface for enumerating term,weight pairs. It extends
+ /// <see cref="BytesRefIterator"/> with a read-only weight.
+ /// </summary>
+ public interface TermFreqIterator : BytesRefIterator
+ {
+
+ /// <summary>
+ /// Gets the term's weight, higher numbers mean better suggestions.
+ /// </summary>
+ long Weight { get; }
+ }
+
+    /// <summary>
+    /// Adapts a plain <see cref="BytesRefIterator"/> to <see cref="TermFreqIterator"/>
+    /// by reporting a constant weight of <c>1</c> for every term.
+    /// </summary>
+    public class TermFreqIteratorWrapper : TermFreqIterator
+    {
+        // The iterator every call is forwarded to.
+        internal BytesRefIterator wrapped;
+
+        /// <summary>
+        /// Creates a new wrapper around the specified iterator; all terms get a
+        /// weight value of <c>1</c>.
+        /// </summary>
+        /// <param name="wrapped"> The iterator to delegate to. </param>
+        public TermFreqIteratorWrapper(BytesRefIterator wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        /// <summary>
+        /// Always <c>1</c>.
+        /// </summary>
+        public virtual long Weight
+        {
+            get
+            {
+                return 1;
+            }
+        }
+
+        /// <summary>
+        /// Returns whatever the wrapped iterator's <c>Next()</c> returns.
+        /// </summary>
+        public BytesRef Next()
+        {
+            BytesRef term = this.wrapped.Next();
+            return term;
+        }
+
+        /// <summary>
+        /// Returns the comparator exposed by the wrapped iterator.
+        /// </summary>
+        public IComparer<BytesRef> Comparator
+        {
+            get
+            {
+                return this.wrapped.Comparator;
+            }
+        }
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/WordBreakSpellChecker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/WordBreakSpellChecker.cs b/src/Lucene.Net.Suggest/Spell/WordBreakSpellChecker.cs
new file mode 100644
index 0000000..03cb212
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/WordBreakSpellChecker.cs
@@ -0,0 +1,542 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Index;
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// <para>
+ /// A spell checker whose sole function is to offer suggestions by combining
+ /// multiple terms into one word and/or breaking terms into multiple words.
+ /// </para>
+ /// </summary>
+ public class WordBreakSpellChecker
+ {
+ // Minimum document frequency a candidate word needs before it is suggested (default 1).
+ private int minSuggestionFrequency = 1;
+ // Minimum length, in code points, of each piece when a term is broken up (default 1;
+ // values below 1 are treated as 1 by GenerateBreakUpSuggestions).
+ private int minBreakWordLength = 1;
+ // Maximum length, in code points, of a word produced by combining terms (default 20).
+ private int maxCombineWordLength = 20;
+ // Upper bound on the number of breaks / joined neighbours per suggestion (default 1).
+ private int maxChanges = 1;
+ // Evaluation counter value at which the suggestion loops stop (default 1000).
+ private int maxEvaluations = 1000;
+
+ /// <summary>
+ /// Term that can be used to prohibit adjacent terms from being combined.
+ /// It has an empty field name and empty text. </summary>
+ public static readonly Term SEPARATOR_TERM = new Term("", "");
+
+ /// <summary>
+ /// Creates a new spellchecker with default configuration values </summary>
+ /// <seealso cref="MaxChanges"/>
+ /// <seealso cref="MaxCombineWordLength"/>
+ /// <seealso cref="MaxEvaluations"/>
+ /// <seealso cref="MinBreakWordLength"/>
+ /// <seealso cref="MinSuggestionFrequency"/>
+ public WordBreakSpellChecker()
+ {
+ }
+
+ /// <summary>
+ /// <para>
+ /// Determines the order in which word break suggestions are listed.
+ /// </para>
+ /// </summary>
+ public enum BreakSuggestionSortMethod
+ {
+ /// <summary>
+ /// <para>
+ /// Sort by the number of word breaks, then by the sum of all the component
+ /// terms' frequencies.
+ /// </para>
+ /// </summary>
+ NUM_CHANGES_THEN_SUMMED_FREQUENCY,
+ /// <summary>
+ /// <para>
+ /// Sort by the number of word breaks, then by the maximum of all the component
+ /// terms' frequencies.
+ /// </para>
+ /// </summary>
+ NUM_CHANGES_THEN_MAX_FREQUENCY
+ }
+
+        /// <summary>
+        /// <para>
+        /// Generate suggestions by breaking the passed-in term into multiple words.
+        /// The scores returned are equal to the number of word breaks needed so a
+        /// lower score is generally preferred over a higher score.
+        /// </para>
+        /// </summary>
+        /// <param name="term"> The term to break up. </param>
+        /// <param name="maxSuggestions"> Maximum number of suggestions to return; values
+        ///          below 1 yield an empty result. </param>
+        /// <param name="ir"> Reader used to look up document frequencies. </param>
+        /// <param name="suggestMode">
+        ///          - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
+        /// <param name="sortMethod">
+        ///          Selects the ordering of the result: by number of changes, then by
+        ///          maximum or by summed frequency. </param>
+        /// <returns> one or more arrays of words formed by breaking up the original term,
+        ///          best suggestion first </returns>
+        /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
+        public virtual SuggestWord[][] SuggestWordBreaks(Term term, int maxSuggestions, IndexReader ir, SuggestMode suggestMode, BreakSuggestionSortMethod sortMethod)
+        {
+            if (maxSuggestions < 1)
+            {
+                return new SuggestWord[0][];
+            }
+            if (suggestMode == null)
+            {
+                suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
+            }
+            // No null check for sortMethod: BreakSuggestionSortMethod is an enum (a value
+            // type), so it can never be null.
+
+            // The cast gives both branches of the conditional a common type.
+            IComparer<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY
+                ? (IComparer<SuggestWordArrayWrapper>)new LengthThenMaxFreqComparator(this)
+                : new LengthThenSumFreqComparator(this);
+            LinkedList<SuggestWordArrayWrapper> suggestions = new LinkedList<SuggestWordArrayWrapper>();
+
+            int origFreq = ir.DocFreq(term);
+            if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
+            {
+                return new SuggestWord[0][];
+            }
+
+            int useMinSuggestionFrequency = minSuggestionFrequency;
+            if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
+            {
+                useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
+            }
+
+            GenerateBreakUpSuggestions(term, ir, 1, maxSuggestions, useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0, sortMethod);
+
+            // Order the collected candidates from worst to best, then fill the result
+            // back to front so that the best suggestion ends up at index 0.
+            List<SuggestWordArrayWrapper> ordered = new List<SuggestWordArrayWrapper>(suggestions);
+            ordered.Sort(queueComparator);
+
+            SuggestWord[][] suggestionArray = new SuggestWord[ordered.Count][];
+            for (int i = 0; i < ordered.Count; i++)
+            {
+                suggestionArray[ordered.Count - 1 - i] = ordered[i].suggestWords;
+            }
+
+            return suggestionArray;
+        }
+
+        /// <summary>
+        /// <para>
+        /// Generate suggestions by combining one or more of the passed-in terms into
+        /// single words. The returned <see cref="CombineSuggestion"/> contains both a
+        /// <see cref="SuggestWord"/> and also an array detailing which passed-in terms were
+        /// involved in creating this combination. The scores returned are equal to the
+        /// number of word combinations needed, also one less than the length of the
+        /// array of original term indexes. Generally, a
+        /// suggestion with a lower score is preferred over a higher score.
+        /// </para>
+        /// <para>
+        /// To prevent two adjacent terms from being combined (for instance, if one is
+        /// mandatory and the other is prohibited), separate the two terms with
+        /// <see cref="WordBreakSpellChecker.SEPARATOR_TERM"/>
+        /// </para>
+        /// <para>
+        /// When suggestMode equals <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, each
+        /// suggestion will include at least one term not in the index.
+        /// </para>
+        /// <para>
+        /// When suggestMode equals <see cref="SuggestMode.SUGGEST_MORE_POPULAR"/>, each
+        /// suggestion will have the same, or better frequency than the most-popular
+        /// included term.
+        /// </para>
+        /// </summary>
+        /// <param name="terms"> The terms to try to combine, in order. </param>
+        /// <param name="maxSuggestions"> Maximum number of suggestions to return; values
+        ///          below 1 yield an empty result. </param>
+        /// <param name="ir"> Reader used to look up document frequencies. </param>
+        /// <param name="suggestMode"> Controls which combinations qualify (see above). </param>
+        /// <returns> an array of words generated by combining original terms, best
+        ///          suggestion first </returns>
+        /// <exception cref="System.IO.IOException"> If there is a low-level I/O error. </exception>
+        public virtual CombineSuggestion[] SuggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
+        {
+            if (maxSuggestions < 1)
+            {
+                return new CombineSuggestion[0];
+            }
+
+            // Frequencies of the original terms are only needed for the modes that
+            // compare against them.
+            int[] origFreqs = null;
+            if (suggestMode != SuggestMode.SUGGEST_ALWAYS)
+            {
+                origFreqs = new int[terms.Length];
+                for (int i = 0; i < terms.Length; i++)
+                {
+                    origFreqs[i] = ir.DocFreq(terms[i]);
+                }
+            }
+
+            IComparer<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator(this);
+            // Kept sorted from worst (First) to best (Last) so that the worst entry can
+            // be evicted whenever the list grows past maxSuggestions.
+            LinkedList<CombineSuggestionWrapper> suggestions = new LinkedList<CombineSuggestionWrapper>();
+
+            int thisTimeEvaluations = 0;
+            for (int i = 0; i < terms.Length - 1; i++)
+            {
+                if (terms[i].Equals(SEPARATOR_TERM))
+                {
+                    continue;
+                }
+                string leftTermText = terms[i].Text();
+                int leftTermLength = leftTermText.CodePointCount(0, leftTermText.Length);
+                if (leftTermLength > maxCombineWordLength)
+                {
+                    continue;
+                }
+                int maxFreq = 0;
+                int minFreq = int.MaxValue;
+                if (origFreqs != null)
+                {
+                    maxFreq = origFreqs[i];
+                    minFreq = origFreqs[i];
+                }
+                string combinedTermText = leftTermText;
+                int combinedLength = leftTermLength;
+                for (int j = i + 1; j < terms.Length && j - i <= maxChanges; j++)
+                {
+                    if (terms[j].Equals(SEPARATOR_TERM))
+                    {
+                        break;
+                    }
+                    string rightTermText = terms[j].Text();
+                    int rightTermLength = rightTermText.CodePointCount(0, rightTermText.Length);
+                    combinedTermText += rightTermText;
+                    combinedLength += rightTermLength;
+                    if (combinedLength > maxCombineWordLength)
+                    {
+                        break;
+                    }
+
+                    if (origFreqs != null)
+                    {
+                        maxFreq = Math.Max(maxFreq, origFreqs[j]);
+                        minFreq = Math.Min(minFreq, origFreqs[j]);
+                    }
+
+                    Term combinedTerm = new Term(terms[0].Field(), combinedTermText);
+                    int combinedTermFreq = ir.DocFreq(combinedTerm);
+
+                    bool popularEnough = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq;
+                    bool hasMissingTerm = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0;
+                    if (popularEnough && hasMissingTerm && combinedTermFreq >= minSuggestionFrequency)
+                    {
+                        // Indexes i..j of the original terms that were joined.
+                        int[] origIndexes = new int[j - i + 1];
+                        for (int k = 0; k < origIndexes.Length; k++)
+                        {
+                            origIndexes[k] = i + k;
+                        }
+                        SuggestWord word = new SuggestWord();
+                        word.freq = combinedTermFreq;
+                        word.score = origIndexes.Length - 1;
+                        word.@string = combinedTerm.Text();
+                        CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(this, new CombineSuggestion(word, origIndexes), (origIndexes.Length - 1));
+
+                        // Insert in sorted position (after any equal entries).
+                        LinkedListNode<CombineSuggestionWrapper> node = suggestions.First;
+                        while (node != null && queueComparator.Compare(node.Value, suggestion) <= 0)
+                        {
+                            node = node.Next;
+                        }
+                        if (node == null)
+                        {
+                            suggestions.AddLast(suggestion);
+                        }
+                        else
+                        {
+                            suggestions.AddBefore(node, suggestion);
+                        }
+                        if (suggestions.Count > maxSuggestions)
+                        {
+                            // Drop the lowest-ranked suggestion.
+                            suggestions.RemoveFirst();
+                        }
+                    }
+                    thisTimeEvaluations++;
+                    // NOTE(review): this only leaves the inner loop; the outer loop keeps
+                    // going. Kept as-is to match the existing behavior - confirm intent.
+                    if (thisTimeEvaluations == maxEvaluations)
+                    {
+                        break;
+                    }
+                }
+            }
+
+            // The list runs worst-to-best; fill the result back to front so the best
+            // suggestion ends up at index 0.
+            CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.Count];
+            int slot = combineSuggestions.Length - 1;
+            foreach (CombineSuggestionWrapper wrapper in suggestions)
+            {
+                combineSuggestions[slot--] = wrapper.combineSuggestion;
+            }
+            return combineSuggestions;
+        }
+
+        /// <summary>
+        /// Tries every split position of <paramref name="term"/> and records each split
+        /// whose two halves both reach <paramref name="useMinSuggestionFrequency"/>.
+        /// While <paramref name="numberBreaks"/> is below the configured maximum number of
+        /// changes, the right half is broken up further by a recursive call.
+        /// </summary>
+        /// <param name="term"> The term to split. </param>
+        /// <param name="ir"> Reader used to look up document frequencies. </param>
+        /// <param name="numberBreaks"> Number of breaks a suggestion made at this level contains. </param>
+        /// <param name="maxSuggestions"> Maximum number of entries kept in <paramref name="suggestions"/>. </param>
+        /// <param name="useMinSuggestionFrequency"> Minimum frequency each half must have. </param>
+        /// <param name="prefix"> Words already split off to the left of <paramref name="term"/>. </param>
+        /// <param name="suggestions"> Receives the suggestions; kept sorted from worst
+        ///          (first) to best (last) according to <paramref name="sortMethod"/>. </param>
+        /// <param name="totalEvaluations"> Evaluations performed so far, checked against the
+        ///          configured maximum number of evaluations. </param>
+        /// <param name="sortMethod"> Ordering used to decide which suggestion to evict. </param>
+        /// <returns> The number of split positions evaluated by this call itself
+        ///          (recursive calls are not included). </returns>
+        private int GenerateBreakUpSuggestions(Term term, IndexReader ir, int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency, SuggestWord[] prefix, LinkedList<SuggestWordArrayWrapper> suggestions, int totalEvaluations, BreakSuggestionSortMethod sortMethod)
+        {
+            string termText = term.Text();
+            int termLength = termText.CodePointCount(0, termText.Length);
+            int useMinBreakWordLength = minBreakWordLength;
+            if (useMinBreakWordLength < 1)
+            {
+                useMinBreakWordLength = 1;
+            }
+            if (termLength < (useMinBreakWordLength * 2))
+            {
+                // Too short to yield two pieces of the minimum length.
+                return 0;
+            }
+
+            // Ranking used to keep the list sorted so that the worst suggestion, rather
+            // than the oldest one, is evicted when the list is full.
+            IComparer<SuggestWordArrayWrapper> ranking = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY
+                ? (IComparer<SuggestWordArrayWrapper>)new LengthThenMaxFreqComparator(this)
+                : new LengthThenSumFreqComparator(this);
+
+            int thisTimeEvaluations = 0;
+            for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++)
+            {
+                // i counts code points; convert it to a char offset before slicing.
+                int end = termText.OffsetByCodePoints(0, i);
+                string leftText = termText.Substring(0, end);
+                string rightText = termText.Substring(end);
+                SuggestWord leftWord = GenerateSuggestWord(ir, term.Field(), leftText);
+
+                if (leftWord.freq >= useMinSuggestionFrequency)
+                {
+                    SuggestWord rightWord = GenerateSuggestWord(ir, term.Field(), rightText);
+                    if (rightWord.freq >= useMinSuggestionFrequency)
+                    {
+                        SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(this, NewSuggestion(prefix, leftWord, rightWord));
+
+                        // Insert in sorted position (after any equal entries).
+                        LinkedListNode<SuggestWordArrayWrapper> node = suggestions.First;
+                        while (node != null && ranking.Compare(node.Value, suggestion) <= 0)
+                        {
+                            node = node.Next;
+                        }
+                        if (node == null)
+                        {
+                            suggestions.AddLast(suggestion);
+                        }
+                        else
+                        {
+                            suggestions.AddBefore(node, suggestion);
+                        }
+                        if (suggestions.Count > maxSuggestions)
+                        {
+                            // Drop the lowest-ranked suggestion.
+                            suggestions.RemoveFirst();
+                        }
+                    }
+                    int newNumberBreaks = numberBreaks + 1;
+                    if (newNumberBreaks <= maxChanges)
+                    {
+                        int evaluations = GenerateBreakUpSuggestions(new Term(term.Field(), rightWord.@string), ir, newNumberBreaks, maxSuggestions, useMinSuggestionFrequency, NewPrefix(prefix, leftWord), suggestions, totalEvaluations, sortMethod);
+                        totalEvaluations += evaluations;
+                    }
+                }
+
+                thisTimeEvaluations++;
+                totalEvaluations++;
+                if (totalEvaluations >= maxEvaluations)
+                {
+                    break;
+                }
+            }
+            return thisTimeEvaluations;
+        }
+
+        /// <summary>
+        /// Returns a new array holding the elements of <paramref name="oldPrefix"/>
+        /// followed by <paramref name="append"/>; the input array is not modified.
+        /// </summary>
+        private static SuggestWord[] NewPrefix(SuggestWord[] oldPrefix, SuggestWord append)
+        {
+            int oldLength = oldPrefix.Length;
+            SuggestWord[] extended = new SuggestWord[oldLength + 1];
+            for (int i = 0; i < oldLength; i++)
+            {
+                extended[i] = oldPrefix[i];
+            }
+            extended[oldLength] = append;
+            return extended;
+        }
+
+        /// <summary>
+        /// Builds a complete suggestion: copies of the <paramref name="prefix"/> words
+        /// followed by <paramref name="append1"/> and <paramref name="append2"/>.
+        /// Every word in the result gets the score <c>prefix.Length + 1</c>; the two
+        /// appended words are modified in place, the prefix words are copied.
+        /// </summary>
+        private static SuggestWord[] NewSuggestion(SuggestWord[] prefix, SuggestWord append1, SuggestWord append2)
+        {
+            int prefixLength = prefix.Length;
+            int score = prefixLength + 1;
+            SuggestWord[] result = new SuggestWord[prefixLength + 2];
+
+            int slot = 0;
+            foreach (SuggestWord original in prefix)
+            {
+                // Copy instead of mutating: the prefix words are shared with other suggestions.
+                result[slot++] = new SuggestWord { @string = original.@string, freq = original.freq, score = score };
+            }
+
+            append1.score = score;
+            result[slot++] = append1;
+            append2.score = score;
+            result[slot] = append2;
+            return result;
+        }
+
+        /// <summary>
+        /// Creates a <see cref="SuggestWord"/> for <paramref name="text"/> whose frequency is
+        /// the document frequency of that text in field <paramref name="fieldname"/> and
+        /// whose score is 1.
+        /// </summary>
+        private SuggestWord GenerateSuggestWord(IndexReader ir, string fieldname, string text)
+        {
+            int docFreq = ir.DocFreq(new Term(fieldname, text));
+            return new SuggestWord { freq = docFreq, score = 1, @string = text };
+        }
+
+        /// <summary>
+        /// Gets or sets the minimum frequency a term must have
+        /// to be part of a suggestion. </summary>
+        public virtual int MinSuggestionFrequency
+        {
+            get { return minSuggestionFrequency; }
+            set { minSuggestionFrequency = value; }
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum length of a combined suggestion. </summary>
+        public virtual int MaxCombineWordLength
+        {
+            get { return maxCombineWordLength; }
+            set { maxCombineWordLength = value; }
+        }
+
+        /// <summary>
+        /// Gets or sets the minimum size of a broken word. </summary>
+        public virtual int MinBreakWordLength
+        {
+            get { return minBreakWordLength; }
+            set { minBreakWordLength = value; }
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum number of changes to perform on the input. </summary>
+        public virtual int MaxChanges
+        {
+            get { return maxChanges; }
+            set { maxChanges = value; }
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum number of word combinations to evaluate. </summary>
+        public virtual int MaxEvaluations
+        {
+            get { return maxEvaluations; }
+            set { maxEvaluations = value; }
+        }
+
+        /// <summary>
+        /// Ranks suggestions with fewer words higher; ties are broken in favour of the
+        /// larger maximum component frequency.
+        /// </summary>
+        private sealed class LengthThenMaxFreqComparator : IComparer<SuggestWordArrayWrapper>
+        {
+            // Enclosing instance; not read by this comparator.
+            private readonly WordBreakSpellChecker outerInstance;
+
+            public LengthThenMaxFreqComparator(WordBreakSpellChecker outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            public int Compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2)
+            {
+                // Reversed operands: the shorter array compares as greater.
+                int byLength = o2.suggestWords.Length - o1.suggestWords.Length;
+                return byLength != 0 ? byLength : o1.freqMax - o2.freqMax;
+            }
+        }
+
+        /// <summary>
+        /// Ranks suggestions with fewer words higher; ties are broken in favour of the
+        /// larger summed component frequency.
+        /// </summary>
+        private sealed class LengthThenSumFreqComparator : IComparer<SuggestWordArrayWrapper>
+        {
+            // Enclosing instance; not read by this comparator.
+            private readonly WordBreakSpellChecker outerInstance;
+
+            public LengthThenSumFreqComparator(WordBreakSpellChecker outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            public int Compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2)
+            {
+                // Reversed operands: the shorter array compares as greater.
+                int byLength = o2.suggestWords.Length - o1.suggestWords.Length;
+                return byLength != 0 ? byLength : o1.freqSum - o2.freqSum;
+            }
+        }
+
+        /// <summary>
+        /// Ranks suggestions with fewer combinations higher; ties are broken in favour
+        /// of the larger frequency of the combined word.
+        /// </summary>
+        private sealed class CombinationsThenFreqComparator : IComparer<CombineSuggestionWrapper>
+        {
+            // Enclosing instance; not read by this comparator.
+            private readonly WordBreakSpellChecker outerInstance;
+
+            public CombinationsThenFreqComparator(WordBreakSpellChecker outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            public int Compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2)
+            {
+                // Reversed operands: fewer combinations compares as greater.
+                int byCombinations = o2.numCombinations - o1.numCombinations;
+                if (byCombinations != 0)
+                {
+                    return byCombinations;
+                }
+                return o1.combineSuggestion.suggestion.freq - o2.combineSuggestion.suggestion.freq;
+            }
+        }
+
+        /// <summary>
+        /// Pairs an array of suggested words with the sum and the maximum of their
+        /// frequencies, both computed once at construction time.
+        /// </summary>
+        private class SuggestWordArrayWrapper
+        {
+            // Enclosing instance; not read by this class.
+            private readonly WordBreakSpellChecker outerInstance;
+
+            internal readonly SuggestWord[] suggestWords;
+            internal readonly int freqMax;
+            internal readonly int freqSum;
+
+            internal SuggestWordArrayWrapper(WordBreakSpellChecker outerInstance, SuggestWord[] suggestWords)
+            {
+                this.outerInstance = outerInstance;
+                this.suggestWords = suggestWords;
+
+                int sum = 0;
+                int max = 0;
+                for (int i = 0; i < suggestWords.Length; i++)
+                {
+                    int freq = suggestWords[i].freq;
+                    sum += freq;
+                    if (freq > max)
+                    {
+                        max = freq;
+                    }
+                }
+                this.freqSum = sum;
+                this.freqMax = max;
+            }
+        }
+
+        /// <summary>
+        /// Pairs a <see cref="CombineSuggestion"/> with the number of combinations that
+        /// were needed to produce it.
+        /// </summary>
+        private class CombineSuggestionWrapper
+        {
+            // Enclosing instance; not read by this class.
+            private readonly WordBreakSpellChecker outerInstance;
+
+            internal readonly CombineSuggestion combineSuggestion;
+            internal readonly int numCombinations;
+
+            internal CombineSuggestionWrapper(WordBreakSpellChecker outerInstance, CombineSuggestion combineSuggestion, int numCombinations)
+            {
+                this.numCombinations = numCombinations;
+                this.combineSuggestion = combineSuggestion;
+                this.outerInstance = outerInstance;
+            }
+        }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/StringHelperClass.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/StringHelperClass.cs b/src/Lucene.Net.Suggest/StringHelperClass.cs
new file mode 100644
index 0000000..172a21e
--- /dev/null
+++ b/src/Lucene.Net.Suggest/StringHelperClass.cs
@@ -0,0 +1,90 @@
+//-------------------------------------------------------------------------------------------
+// Copyright © 2007 - 2014 Tangible Software Solutions Inc.
+// This class can be used by anyone provided that the copyright notice remains intact.
+//
+// This class is used to convert some aspects of the Java String class.
+//-------------------------------------------------------------------------------------------
+internal static class StringHelperClass
+{
+    //----------------------------------------------------------------------------------
+    //	This method replaces the Java String.substring method when 'start' is a
+    //	method call or calculated value to ensure that 'start' is obtained just once.
+    //	'end' is exclusive, as in Java.
+    //----------------------------------------------------------------------------------
+    internal static string SubstringSpecial(this string self, int start, int end)
+    {
+        return self.Substring(start, end - start);
+    }
+
+    //------------------------------------------------------------------------------------
+    //	This method is used to replace calls to the 2-arg Java String.startsWith method.
+    //	Like the Java method it returns false, instead of throwing, when 'toffset' is
+    //	negative or leaves too little room for 'prefix'. Only the characters at
+    //	'toffset' are compared (ordinal), not the whole remainder of the string.
+    //------------------------------------------------------------------------------------
+    internal static bool StartsWith(this string self, string prefix, int toffset)
+    {
+        if (toffset < 0 || toffset > self.Length - prefix.Length)
+        {
+            return false;
+        }
+        if (prefix.Length == 0)
+        {
+            // An empty prefix matches at every valid offset, including self.Length.
+            return true;
+        }
+        return string.CompareOrdinal(self, toffset, prefix, 0, prefix.Length) == 0;
+    }
+
+    //------------------------------------------------------------------------------
+    //	This method is used to replace most calls to the Java String.split method.
+    //	When 'trimTrailingEmptyStrings' is set, trailing empty strings are removed;
+    //	if the delimiter matched and every piece is empty the result is an empty
+    //	array, as in Java. A single-element result (no match) is returned as is.
+    //------------------------------------------------------------------------------
+    internal static string[] Split(this string self, string regexDelimiter, bool trimTrailingEmptyStrings)
+    {
+        string[] splitArray = System.Text.RegularExpressions.Regex.Split(self, regexDelimiter);
+
+        if (trimTrailingEmptyStrings && splitArray.Length > 1)
+        {
+            // Number of leading elements to keep: everything up to the last non-empty one.
+            int keep = splitArray.Length;
+            while (keep > 0 && splitArray[keep - 1].Length == 0)
+            {
+                keep--;
+            }
+            if (keep < splitArray.Length)
+            {
+                System.Array.Resize(ref splitArray, keep);
+            }
+        }
+
+        return splitArray;
+    }
+
+    //-----------------------------------------------------------------------------
+    //	These methods are used to replace calls to some Java String constructors.
+    //	Without an explicit encoding name the bytes are decoded as UTF-8.
+    //-----------------------------------------------------------------------------
+    internal static string NewString(sbyte[] bytes)
+    {
+        return NewString(bytes, 0, bytes.Length);
+    }
+    internal static string NewString(sbyte[] bytes, int index, int count)
+    {
+        // The cast through object reinterprets the sbyte[] as byte[] without copying.
+        return System.Text.Encoding.UTF8.GetString((byte[])(object)bytes, index, count);
+    }
+    internal static string NewString(sbyte[] bytes, string encoding)
+    {
+        return NewString(bytes, 0, bytes.Length, encoding);
+    }
+    internal static string NewString(sbyte[] bytes, int index, int count, string encoding)
+    {
+        return System.Text.Encoding.GetEncoding(encoding).GetString((byte[])(object)bytes, index, count);
+    }
+
+    //--------------------------------------------------------------------------------
+    //	These methods are used to replace calls to the Java String.getBytes methods.
+    //	Without an explicit encoding name the string is encoded as UTF-8.
+    //--------------------------------------------------------------------------------
+    internal static sbyte[] GetBytes(this string self)
+    {
+        return GetSBytesForEncoding(System.Text.Encoding.UTF8, self);
+    }
+    internal static sbyte[] GetBytes(this string self, string encoding)
+    {
+        return GetSBytesForEncoding(System.Text.Encoding.GetEncoding(encoding), self);
+    }
+    private static sbyte[] GetSBytesForEncoding(System.Text.Encoding encoding, string s)
+    {
+        // Encode straight into an sbyte[] of exactly the required size.
+        sbyte[] sbytes = new sbyte[encoding.GetByteCount(s)];
+        encoding.GetBytes(s, 0, s.Length, (byte[])(object)sbytes, 0);
+        return sbytes;
+    }
+}
\ No newline at end of file