Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:24:52 UTC
[5/8] Porting Lucene.Net.Suggest (still not compiling)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
new file mode 100644
index 0000000..bb1061f
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingInfixSuggester.cs
@@ -0,0 +1,792 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.NGram; // assumed namespace for EdgeNGramTokenFilter
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Codecs.Lucene46;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Index.Sorter; // SortingMergePolicy, EarlyTerminatingSortingCollector
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Directory = Lucene.Net.Store.Directory;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ // TODO:
+ // - a PostingsFormat that stores super-high-freq terms as
+ // a bitset should be a win for the prefix terms?
+ // (LUCENE-5052)
+ // - we could offer a better integration with
+ // DocumentDictionary and NRT? so that your suggester
+ // "automatically" keeps in sync w/ your index
+
+ /// <summary>
+ /// Analyzes the input text and then suggests matches based
+ /// on prefix matches to any tokens in the indexed text.
+ /// This also highlights the tokens that match.
+ ///
+ /// <para>This suggester supports payloads. Matches are sorted only
+ /// by the suggest weight; it would be nice to support
+ /// blended score + weight sort in the future. This means
+ /// this suggester best applies when there is a strong
+ /// a-priori ranking of all the suggestions.
+ ///
+ /// </para>
+ /// <para>This suggester supports contexts, however the
+ /// contexts must be valid utf8 (arbitrary binary terms will
+ /// not work).
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
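+ /// <example>
+ /// A minimal usage sketch (the directory path, analyzer and version
+ /// constant are illustrative assumptions, and <c>inputIterator</c> stands
+ /// for any InputIterator over your suggestion entries):
+ /// <code>
+ /// Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
+ /// using (var suggester = new AnalyzingInfixSuggester(Version.LUCENE_48,
+ ///     FSDirectory.Open(new DirectoryInfo("suggest")), analyzer))
+ /// {
+ ///     suggester.Build(inputIterator);
+ ///     foreach (var result in suggester.Lookup("ban", 5, true, true))
+ ///     {
+ ///         Console.WriteLine(result); // prints each LookupResult
+ ///     }
+ /// }
+ /// </code>
+ /// </example>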
+
+ public class AnalyzingInfixSuggester : Lookup, IDisposable
+ {
+
+ /// <summary>
+ /// Field name used for the indexed text. </summary>
+ protected internal const string TEXT_FIELD_NAME = "text";
+
+ /// <summary>
+ /// Field name used for the indexed text, as a
+ /// StringField, for exact lookup.
+ /// </summary>
+ protected internal const string EXACT_TEXT_FIELD_NAME = "exacttext";
+
+ /// <summary>
+ /// Field name used for the indexed context, as a
+ /// StringField and a SortedSetDVField, for filtering.
+ /// </summary>
+ protected internal const string CONTEXTS_FIELD_NAME = "contexts";
+
+ /// <summary>
+ /// Analyzer used at search time </summary>
+ protected internal readonly Analyzer queryAnalyzer;
+ /// <summary>
+ /// Analyzer used at index time </summary>
+ protected internal readonly Analyzer indexAnalyzer;
+ internal readonly Version matchVersion;
+ private readonly Directory dir;
+ internal readonly int minPrefixChars;
+
+ /// <summary>
+ /// Used for ongoing NRT additions/updates. </summary>
+ private IndexWriter writer;
+
+ /// <summary>
+ /// <seealso cref="IndexSearcher"/> used for lookups. </summary>
+ protected internal SearcherManager searcherMgr;
+
+ /// <summary>
+ /// Default minimum number of leading characters before
+ /// PrefixQuery is used (4).
+ /// </summary>
+ public const int DEFAULT_MIN_PREFIX_CHARS = 4;
+
+ /// <summary>
+ /// How we sort the postings and search results. </summary>
+ private static readonly Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));
+
+ /// <summary>
+ /// Create a new instance, loading from a previously built
+ /// AnalyzingInfixSuggester directory, if it exists. This directory must be
+ /// private to the infix suggester (i.e., not an external
+ /// Lucene index). Note that <seealso cref="#close"/>
+ /// will also close the provided directory.
+ /// </summary>
+ public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer analyzer)
+ : this(matchVersion, dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS)
+ {
+ }
+
+ /// <summary>
+ /// Create a new instance, loading from a previously built
+ /// AnalyzingInfixSuggester directory, if it exists. This directory must be
+ /// private to the infix suggester (i.e., not an external
+ /// Lucene index). Note that <seealso cref="#close"/>
+ /// will also close the provided directory.
+ /// </summary>
+ /// <param name="minPrefixChars"> Minimum number of leading characters
+ /// before PrefixQuery is used (default 4).
+ /// Prefixes shorter than this are indexed as character
+ /// ngrams (increasing index size but making lookups
+ /// faster). </param>
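+ /// <remarks>
+ /// For example, with the default value of 4 the last query tokens "a",
+ /// "ab" and "abc" are matched with a TermQuery against the indexed
+ /// "textgrams" field, while "abcd" and longer tokens fall back to a
+ /// PrefixQuery on the main text field (see GetLastTokenQuery).
+ /// </remarks>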
+ public AnalyzingInfixSuggester(Version matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars)
+ {
+
+ if (minPrefixChars < 0)
+ {
+ throw new System.ArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
+ }
+
+ this.queryAnalyzer = queryAnalyzer;
+ this.indexAnalyzer = indexAnalyzer;
+ this.matchVersion = matchVersion;
+ this.dir = dir;
+ this.minPrefixChars = minPrefixChars;
+
+ if (DirectoryReader.IndexExists(dir))
+ {
+ // Already built; open it:
+ writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, IndexWriterConfig.OpenMode.APPEND));
+ searcherMgr = new SearcherManager(writer, true, null);
+ }
+ }
+
+ /// <summary>
+ /// Override this to customize index settings, e.g. which
+ /// codec to use.
+ /// </summary>
+ protected internal virtual IndexWriterConfig GetIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode)
+ {
+ IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer);
+ iwc.Codec = new Lucene46Codec();
+ iwc.OpenMode = openMode;
+
+ // This way all merged segments will be sorted at
+ // merge time, allow for per-segment early termination
+ // when those segments are searched:
+ iwc.MergePolicy = new SortingMergePolicy(iwc.MergePolicy, SORT);
+
+ return iwc;
+ }
+
+ /// <summary>
+ /// Subclass can override to choose a specific <seealso cref="Directory"/>
+ /// implementation.
+ /// </summary>
+ protected internal virtual Directory GetDirectory(DirectoryInfo path)
+ {
+ return FSDirectory.Open(path);
+ }
+
+ public override void Build(InputIterator iter)
+ {
+
+ if (searcherMgr != null)
+ {
+ searcherMgr.Dispose();
+ searcherMgr = null;
+ }
+
+ if (writer != null)
+ {
+ writer.Dispose();
+ writer = null;
+ }
+
+ AtomicReader r = null;
+ bool success = false;
+ try
+ {
+ // First pass: build a temporary normal Lucene index,
+ // just indexing the suggestions as they iterate:
+ writer = new IndexWriter(dir, GetIndexWriterConfig(matchVersion, GramAnalyzer, IndexWriterConfig.OpenMode.CREATE));
+ //long t0 = System.nanoTime();
+
+ // TODO: use threads?
+ BytesRef text;
+ while ((text = iter.Next()) != null)
+ {
+ BytesRef payload;
+ if (iter.HasPayloads)
+ {
+ payload = iter.Payload;
+ }
+ else
+ {
+ payload = null;
+ }
+
+ Add(text, iter.Contexts, iter.Weight, payload);
+ }
+
+ //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
+
+ searcherMgr = new SearcherManager(writer, true, null);
+ success = true;
+ }
+ finally
+ {
+ if (success)
+ {
+ IOUtils.Close(r);
+ }
+ else
+ {
+ IOUtils.CloseWhileHandlingException(writer, r);
+ writer = null;
+ }
+ }
+ }
+
+ private Analyzer GramAnalyzer
+ {
+ get
+ {
+ return new AnalyzerWrapperAnonymousInnerClassHelper(this, Analyzer.PER_FIELD_REUSE_STRATEGY);
+ }
+ }
+
+ private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper
+ {
+ private readonly AnalyzingInfixSuggester outerInstance;
+
+ public AnalyzerWrapperAnonymousInnerClassHelper(AnalyzingInfixSuggester outerInstance, Analyzer.ReuseStrategy reuseStrategy)
+ : base(reuseStrategy)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ protected override Analyzer GetWrappedAnalyzer(string fieldName)
+ {
+ return outerInstance.indexAnalyzer;
+ }
+
+ protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
+ {
+ if (fieldName.Equals("textgrams") && outerInstance.minPrefixChars > 0)
+ {
+ return new TokenStreamComponents(components.Tokenizer, new EdgeNGramTokenFilter(outerInstance.matchVersion, components.TokenStream, 1, outerInstance.minPrefixChars));
+ }
+ else
+ {
+ return components;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Adds a new suggestion. Be sure to use <seealso cref="#update"/>
+ /// instead if you want to replace a previous suggestion.
+ /// After adding or updating a batch of new suggestions,
+ /// you must call <seealso cref="#refresh"/> in the end in order to
+ /// see the suggestions in <seealso cref="#lookup"/>
+ /// </summary>
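+ /// <example>
+ /// A sketch of the add-then-refresh pattern on an existing suggester
+ /// instance (the texts, weights and payloads are illustrative):
+ /// <code>
+ /// suggester.Add(new BytesRef("apple pie"), null, 42, new BytesRef("id:1"));
+ /// suggester.Add(new BytesRef("apple cider"), null, 17, new BytesRef("id:2"));
+ /// suggester.Refresh(); // make the new suggestions visible to Lookup
+ /// </code>
+ /// </example>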
+ public virtual void Add(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+ {
+ writer.AddDocument(BuildDocument(text, contexts, weight, payload));
+ }
+
+ /// <summary>
+ /// Updates a previous suggestion, matching the exact same
+ /// text as before. Use this to change the weight or
+ /// payload of an already added suggestion. If you know
+ /// this text is not already present you can use
+ /// <seealso cref="#add"/> instead. After adding or updating a batch of
+ /// new suggestions, you must call <seealso cref="#refresh"/> in the
+ /// end in order to see the suggestions in <seealso cref="#lookup"/>
+ /// </summary>
+ public virtual void Update(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+ {
+ writer.UpdateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.Utf8ToString()), BuildDocument(text, contexts, weight, payload));
+ }
+
+ private Document BuildDocument(BytesRef text, HashSet<BytesRef> contexts, long weight, BytesRef payload)
+ {
+ string textString = text.Utf8ToString();
+ var ft = TextFieldType;
+ var doc = new Document
+ {
+ new Field(TEXT_FIELD_NAME, textString, ft),
+ new Field("textgrams", textString, ft),
+ new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO),
+ new BinaryDocValuesField(TEXT_FIELD_NAME, text),
+ new NumericDocValuesField("weight", weight)
+ };
+ if (payload != null)
+ {
+ doc.Add(new BinaryDocValuesField("payloads", payload));
+ }
+ if (contexts != null)
+ {
+ foreach (BytesRef context in contexts)
+ {
+ // TODO: if we had a BinaryTermField we could fix
+ // this "must be valid ut8f" limitation:
+ doc.Add(new StringField(CONTEXTS_FIELD_NAME, context.Utf8ToString(), Field.Store.NO));
+ doc.Add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
+ }
+ }
+ return doc;
+ }
+
+ /// <summary>
+ /// Reopens the underlying searcher; it's best to "batch
+ /// up" many additions/updates, and then call refresh
+ /// once in the end.
+ /// </summary>
+ public virtual void Refresh()
+ {
+ searcherMgr.MaybeRefreshBlocking();
+ }
+
+ /// <summary>
+ /// Subclass can override this method to change the field type of the text field
+ /// e.g. to change the index options
+ /// </summary>
+ protected internal virtual FieldType TextFieldType
+ {
+ get
+ {
+ var ft = new FieldType(TextField.TYPE_NOT_STORED);
+ ft.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
+ ft.OmitNorms = true;
+
+ return ft;
+ }
+ }
+
+ public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+ {
+ return Lookup(key, contexts, num, true, true);
+ }
+
+ /// <summary>
+ /// Lookup, without any context.
+ /// </summary>
+ public virtual IList<LookupResult> Lookup(string key, int num, bool allTermsRequired, bool doHighlight)
+ {
+ return Lookup(key, null, num, allTermsRequired, doHighlight);
+ }
+
+ /// <summary>
+ /// This is called if the last token isn't ended
+ /// (e.g. user did not type a space after it). Return an
+ /// appropriate Query clause to add to the BooleanQuery.
+ /// </summary>
+ protected internal virtual Query GetLastTokenQuery(string token)
+ {
+ if (token.Length < minPrefixChars)
+ {
+ // The leading ngram was directly indexed:
+ return new TermQuery(new Term("textgrams", token));
+ }
+
+ return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
+ }
+
+ /// <summary>
+ /// Retrieve suggestions, specifying whether all terms
+ /// must match (<c>allTermsRequired</c>) and whether the hits
+ /// should be highlighted (<c>doHighlight</c>).
+ /// </summary>
+ public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
+ {
+
+ if (searcherMgr == null)
+ {
+ throw new InvalidOperationException("suggester was not built");
+ }
+
+ BooleanClause.Occur occur;
+ if (allTermsRequired)
+ {
+ occur = BooleanClause.Occur.MUST;
+ }
+ else
+ {
+ occur = BooleanClause.Occur.SHOULD;
+ }
+
+ TokenStream ts = null;
+ BooleanQuery query;
+ var matchedTokens = new HashSet<string>();
+ string prefixToken = null;
+
+ try
+ {
+ ts = queryAnalyzer.TokenStream("", new StringReader(key));
+
+ //long t0 = System.currentTimeMillis();
+ ts.Reset();
+ var termAtt = ts.AddAttribute<CharTermAttribute>();
+ var offsetAtt = ts.AddAttribute<OffsetAttribute>();
+ string lastToken = null;
+ query = new BooleanQuery();
+ int maxEndOffset = -1;
+ matchedTokens = new HashSet<string>();
+ while (ts.IncrementToken())
+ {
+ if (lastToken != null)
+ {
+ matchedTokens.Add(lastToken);
+ query.Add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
+ }
+ lastToken = termAtt.ToString();
+ if (lastToken != null)
+ {
+ maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
+ }
+ }
+ ts.End();
+
+ if (lastToken != null)
+ {
+ Query lastQuery;
+ if (maxEndOffset == offsetAtt.EndOffset())
+ {
+ // Use PrefixQuery (or the ngram equivalent) when
+ // there were no trailing discarded chars in the
+ // string (e.g. whitespace), so that if query does
+ // not end with a space we show prefix matches for
+ // that token:
+ lastQuery = GetLastTokenQuery(lastToken);
+ prefixToken = lastToken;
+ }
+ else
+ {
+ // Use TermQuery for an exact match if there were
+ // trailing discarded chars (e.g. whitespace), so
+ // that if query ends with a space we only show
+ // exact matches for that term:
+ matchedTokens.Add(lastToken);
+ lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
+ }
+ if (lastQuery != null)
+ {
+ query.Add(lastQuery, occur);
+ }
+ }
+
+ if (contexts != null)
+ {
+ BooleanQuery sub = new BooleanQuery();
+ query.Add(sub, BooleanClause.Occur.MUST);
+ foreach (BytesRef context in contexts)
+ {
+ // NOTE: we "should" wrap this in
+ // ConstantScoreQuery, or maybe send this as a
+ // Filter instead to search, but since all of
+ // these are MUST'd, the change to the score won't
+ // affect the overall ranking. Since we indexed
+ // as DOCS_ONLY, the perf should be the same
+ // either way (no freq int[] blocks to decode):
+
+ // TODO: if we had a BinaryTermField we could fix
+ // this "must be valid ut8f" limitation:
+ sub.Add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context.Utf8ToString())), BooleanClause.Occur.SHOULD);
+ }
+ }
+ }
+ finally
+ {
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+
+ // TODO: we could allow blended sort here, combining
+ // weight w/ score. Now we ignore score and sort only
+ // by weight:
+
+ Query finalQuery = FinishQuery(query, allTermsRequired);
+
+ //System.out.println("finalQuery=" + query);
+
+ // Sort by weight, descending:
+ TopFieldCollector c = TopFieldCollector.Create(SORT, num, true, false, false, false);
+
+ // We sorted postings by weight during indexing, so we
+ // only retrieve the first num hits now:
+ Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
+ IndexSearcher searcher = searcherMgr.Acquire();
+ IList<LookupResult> results = null;
+ try
+ {
+ //System.out.println("got searcher=" + searcher);
+ searcher.Search(finalQuery, c2);
+
+ TopFieldDocs hits = (TopFieldDocs)c.TopDocs();
+
+ // Slower way if postings are not pre-sorted by weight:
+ // hits = searcher.search(query, null, num, SORT);
+ results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
+ }
+ finally
+ {
+ searcherMgr.Release(searcher);
+ }
+
+ //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
+ //System.out.println(results);
+
+ return results;
+ }
+
+ /// <summary>
+ /// Create the results based on the search hits.
+ /// Can be overridden by subclass to add particular behavior (e.g. weight transformation) </summary>
+ /// <exception cref="IOException"> If there are problems reading fields from the underlying Lucene index. </exception>
+ protected internal virtual IList<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, string charSequence, bool doHighlight, HashSet<string> matchedTokens, string prefixToken)
+ {
+
+ BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
+
+ // This will just be null if app didn't pass payloads to build():
+ // TODO: maybe just stored fields? they compress...
+ BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
+ IList<AtomicReaderContext> leaves = searcher.IndexReader.Leaves();
+ IList<LookupResult> results = new List<LookupResult>();
+ BytesRef scratch = new BytesRef();
+ for (int i = 0; i < hits.ScoreDocs.Length; i++)
+ {
+ FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
+ textDV.Get(fd.Doc, scratch);
+ string text = scratch.Utf8ToString();
+ long score = (long)fd.Fields[0];
+
+ BytesRef payload;
+ if (payloadsDV != null)
+ {
+ payload = new BytesRef();
+ payloadsDV.Get(fd.Doc, payload);
+ }
+ else
+ {
+ payload = null;
+ }
+
+ // Must look up sorted-set by segment:
+ int segment = ReaderUtil.SubIndex(fd.Doc, leaves);
+ SortedSetDocValues contextsDV = leaves[segment].Reader().GetSortedSetDocValues(CONTEXTS_FIELD_NAME);
+ HashSet<BytesRef> contexts;
+ if (contextsDV != null)
+ {
+ contexts = new HashSet<BytesRef>();
+ contextsDV.Document = fd.Doc - leaves[segment].DocBase;
+ long ord;
+ while ((ord = contextsDV.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
+ {
+ BytesRef context = new BytesRef();
+ contextsDV.LookupOrd(ord, context);
+ contexts.Add(context);
+ }
+ }
+ else
+ {
+ contexts = null;
+ }
+
+ LookupResult result;
+
+ if (doHighlight)
+ {
+ object highlightKey = Highlight(text, matchedTokens, prefixToken);
+ result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload, contexts);
+ }
+ else
+ {
+ result = new LookupResult(text, score, payload, contexts);
+ }
+
+ results.Add(result);
+ }
+
+ return results;
+ }
+
+ /// <summary>
+ /// Subclass can override this to tweak the Query before
+ /// searching.
+ /// </summary>
+ protected internal virtual Query FinishQuery(BooleanQuery bq, bool allTermsRequired)
+ {
+ return bq;
+ }
+
+ /// <summary>
+ /// Override this method to customize the object
+ /// representing a single highlighted suggestion; the
+ /// result is set on each
+ /// <seealso cref="LookupResult#highlightKey"/> member.
+ /// </summary>
+ protected internal virtual object Highlight(string text, HashSet<string> matchedTokens, string prefixToken)
+ {
+ TokenStream ts = queryAnalyzer.TokenStream("text", new StringReader(text));
+ try
+ {
+ var termAtt = ts.AddAttribute<CharTermAttribute>();
+ var offsetAtt = ts.AddAttribute<OffsetAttribute>();
+ ts.Reset();
+ var sb = new StringBuilder();
+ int upto = 0;
+ while (ts.IncrementToken())
+ {
+ string token = termAtt.ToString();
+ int startOffset = offsetAtt.StartOffset();
+ int endOffset = offsetAtt.EndOffset();
+ if (upto < startOffset)
+ {
+ addNonMatch(sb, text.Substring(upto, startOffset - upto));
+ upto = startOffset;
+ }
+ else if (upto > startOffset)
+ {
+ continue;
+ }
+
+ if (matchedTokens.Contains(token))
+ {
+ // Token matches.
+ AddWholeMatch(sb, text.Substring(startOffset, endOffset - startOffset), token);
+ upto = endOffset;
+ }
+ else if (prefixToken != null && token.StartsWith(prefixToken, StringComparison.Ordinal))
+ {
+ AddPrefixMatch(sb, text.Substring(startOffset, endOffset - startOffset), token, prefixToken);
+ upto = endOffset;
+ }
+ }
+ ts.End();
+ int endOffset = offsetAtt.EndOffset();
+ if (upto < endOffset)
+ {
+ addNonMatch(sb, text.Substring(upto));
+ }
+ return sb.ToString();
+ }
+ finally
+ {
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+ }
+
+ /// <summary>
+ /// Called while highlighting a single result, to append a
+ /// non-matching chunk of text from the suggestion to the
+ /// provided fragments list. </summary>
+ /// <param name="sb"> The {@code StringBuilder} to append to </param>
+ /// <param name="text"> The text chunk to add </param>
+ protected internal virtual void addNonMatch(StringBuilder sb, string text)
+ {
+ sb.Append(text);
+ }
+
+ /// <summary>
+ /// Called while highlighting a single result, to append
+ /// the whole matched token to the provided fragments list. </summary>
+ /// <param name="sb"> The {@code StringBuilder} to append to </param>
+ /// <param name="surface"> The surface form (original) text </param>
+ /// <param name="analyzed"> The analyzed token corresponding to the surface form text </param>
+ protected internal virtual void AddWholeMatch(StringBuilder sb, string surface, string analyzed)
+ {
+ sb.Append("<b>");
+ sb.Append(surface);
+ sb.Append("</b>");
+ }
+
+ /// <summary>
+ /// Called while highlighting a single result, to append a
+ /// matched prefix token, to the provided fragments list. </summary>
+ /// <param name="sb"> The {@code StringBuilder} to append to </param>
+ /// <param name="surface"> The fragment of the surface form
+ /// (indexed during <seealso cref="#build"/>, corresponding to
+ /// this match </param>
+ /// <param name="analyzed"> The analyzed token that matched </param>
+ /// <param name="prefixToken"> The prefix of the token that matched </param>
+ protected internal virtual void AddPrefixMatch(StringBuilder sb, string surface, string analyzed, string prefixToken)
+ {
+ // TODO: apps can try to invert their analysis logic
+ // here, e.g. downcase the two before checking prefix:
+ sb.Append("<b>");
+ sb.Append(surface.Substring(0, prefixToken.Length));
+ sb.Append("</b>");
+ if (prefixToken.Length < surface.Length)
+ {
+ sb.Append(surface.Substring(prefixToken.Length));
+ }
+ }
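+
+ // As an illustrative sketch (not part of this class), a subclass could
+ // change the markup by overriding the hooks above, e.g.:
+ //
+ // protected internal override void AddWholeMatch(StringBuilder sb, string surface, string analyzed)
+ // {
+ //     sb.Append("<em>").Append(surface).Append("</em>");
+ // }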
+
+ public override bool Store(DataOutput output)
+ {
+ return false;
+ }
+
+ public override bool Load(DataInput input)
+ {
+ return false;
+ }
+
+ public void Dispose()
+ {
+ if (searcherMgr != null)
+ {
+ searcherMgr.Dispose();
+ searcherMgr = null;
+ }
+ if (writer != null)
+ {
+ writer.Dispose();
+ dir.Dispose();
+ writer = null;
+ }
+ }
+
+ public override long SizeInBytes()
+ {
+ long mem = RamUsageEstimator.ShallowSizeOf(this);
+ try
+ {
+ if (searcherMgr != null)
+ {
+ IndexSearcher searcher = searcherMgr.Acquire();
+ try
+ {
+ foreach (AtomicReaderContext context in searcher.IndexReader.Leaves())
+ {
+ AtomicReader reader = FilterAtomicReader.Unwrap(context.Reader());
+ if (reader is SegmentReader)
+ {
+ mem += ((SegmentReader)reader).RamBytesUsed();
+ }
+ }
+ }
+ finally
+ {
+ searcherMgr.Release(searcher);
+ }
+ }
+ return mem;
+ }
+ catch (IOException ioe)
+ {
+ throw new Exception(ioe.Message, ioe);
+ }
+ }
+
+ public override long Count
+ {
+ get
+ {
+ IndexSearcher searcher = searcherMgr.Acquire();
+ try
+ {
+ return searcher.IndexReader.NumDocs();
+ }
+ finally
+ {
+ searcherMgr.Release(searcher);
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
new file mode 100644
index 0000000..885b992
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/AnalyzingSuggester.cs
@@ -0,0 +1,1093 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using Lucene.Net.Analysis;
+using Lucene.Net.Store;
+using Lucene.Net.Support; // assumed home of the FileSupport helper used below
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Suggester that first analyzes the surface form, adds the
+ /// analyzed form to a weighted FST, and then does the same
+ /// thing at lookup time. This means lookup is based on the
+ /// analyzed form while suggestions are still the surface
+ /// form(s).
+ ///
+ /// <para>
+ /// This can result in powerful suggester functionality. For
+ /// example, if you use an analyzer removing stop words,
+ /// then the partial text "ghost chr..." could see the
+ /// suggestion "The Ghost of Christmas Past". Note that
+ /// position increments MUST NOT be preserved for this example
+ /// to work, so you should call the constructor with
+ /// <code>preservePositionIncrements</code> parameter set to
+ /// false
+ ///
+ /// </para>
+ /// <para>
+ /// If SynonymFilter is used to map wifi and wireless network to
+ /// hotspot then the partial text "wirele..." could suggest
+ /// "wifi router". Token normalization like stemmers, accent
+ /// removal, etc., would allow suggestions to ignore such
+ /// variations.
+ ///
+ /// </para>
+ /// <para>
+ /// When two matching suggestions have the same weight, they
+ /// are tie-broken by the analyzed form. If their analyzed
+ /// form is the same then the order is undefined.
+ ///
+ /// </para>
+ /// <para>
+ /// There are some limitations:
+ /// <ul>
+ ///
+ /// <li> A lookup from a query like "net" in English won't
+ /// be any different than "net " (ie, user added a
+ /// trailing space) because analyzers don't reflect
+ /// when they've seen a token separator and when they
+ /// haven't.
+ ///
+ /// <li> If you're using <c>StopFilter</c>, and the user will
+ /// type "fast apple", but so far all they've typed is
+ /// "fast a", again because the analyzer doesn't convey whether
+ /// it's seen a token separator after the "a",
+ /// {@code StopFilter} will remove that "a" causing
+ /// far more matches than you'd expect.
+ ///
+ /// <li> Lookups with the empty string return no results
+ /// instead of all results.
+ /// </ul>
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
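+ /// <example>
+ /// A minimal usage sketch (the analyzer, version constant and
+ /// <c>inputIterator</c> source are illustrative assumptions):
+ /// <code>
+ /// var suggester = new AnalyzingSuggester(new StandardAnalyzer(Version.LUCENE_48));
+ /// suggester.Build(inputIterator); // surface forms with weights
+ /// foreach (var result in suggester.Lookup("ghost chr", null, false, 5))
+ /// {
+ ///     Console.WriteLine(result);
+ /// }
+ /// </code>
+ /// </example>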
+ public class AnalyzingSuggester : Lookup
+ {
+
+ /// <summary>
+ /// FST&lt;Weight,Surface&gt;:
+ /// input is the analyzed form, with a null byte between terms;
+ /// weights are encoded as costs: (int.MaxValue - weight);
+ /// surface is the original, unanalyzed form.
+ /// </summary>
+ private FST<PairOutputs.Pair<long?, BytesRef>> fst = null;
+
+ /// <summary>
+ /// Analyzer that will be used for analyzing suggestions at
+ /// index time.
+ /// </summary>
+ private readonly Analyzer indexAnalyzer;
+
+ /// <summary>
+ /// Analyzer that will be used for analyzing suggestions at
+ /// query time.
+ /// </summary>
+ private readonly Analyzer queryAnalyzer;
+
+ /// <summary>
+ /// True if exact match suggestions should always be returned first.
+ /// </summary>
+ private readonly bool exactFirst;
+
+ /// <summary>
+ /// True if separator between tokens should be preserved.
+ /// </summary>
+ private readonly bool preserveSep;
+
+ /// <summary>
+ /// Include this flag in the options parameter to the
+ /// AnalyzingSuggester(Analyzer,Analyzer,int,int,int,bool) constructor to always
+ /// return the exact match first, regardless of score. This
+ /// has no performance impact but could result in
+ /// low-quality suggestions.
+ /// </summary>
+ public const int EXACT_FIRST = 1;
+
+ /// <summary>
+ /// Include this flag in the options parameter to the
+ /// AnalyzingSuggester(Analyzer,Analyzer,int,int,int,bool) constructor to preserve
+ /// token separators when matching.
+ /// </summary>
+ public const int PRESERVE_SEP = 2;
+
+ /// <summary>
+ /// Represents the separation between tokens, if
+ /// PRESERVE_SEP was specified
+ /// </summary>
+ private const int SEP_LABEL = '\u001F';
+
+ /// <summary>
+ /// Marks end of the analyzed input and start of dedup
+ /// byte.
+ /// </summary>
+ private const int END_BYTE = 0x0;
+
+ /// <summary>
+ /// Maximum number of dup surface forms (different surface
+ /// forms for the same analyzed form).
+ /// </summary>
+ private readonly int maxSurfaceFormsPerAnalyzedForm;
+
+ /// <summary>
+ /// Maximum graph paths to index for a single analyzed
+ /// surface form. This only matters if your analyzer
+ /// makes lots of alternate paths (e.g. contains
+ /// SynonymFilter).
+ /// </summary>
+ private readonly int maxGraphExpansions;
+
+ /// <summary>
+ /// Highest number of analyzed paths we saw for any single
+ /// input surface form. For analyzers that never create
+ /// graphs this will always be 1.
+ /// </summary>
+ private int maxAnalyzedPathsForOneInput;
+
+ private bool hasPayloads;
+
+ private const int PAYLOAD_SEP = '\u001f';
+
+ /// <summary>
+ /// Whether position holes should appear in the automaton. </summary>
+ private bool preservePositionIncrements;
+
+ /// <summary>
+ /// Number of entries the lookup was built with </summary>
+ private long count = 0;
+
+ /// <summary>
+ /// Calls <c>AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
+ /// PRESERVE_SEP, 256, -1, true)</c>
+ /// </summary>
+ public AnalyzingSuggester(Analyzer analyzer)
+ : this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)
+ {
+ }
+
+ /// <summary>
+ /// Calls <c>AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
+ /// PRESERVE_SEP, 256, -1, true)</c>
+ /// </summary>
+ public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
+ : this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new suggester.
+ /// </summary>
+ /// <param name="indexAnalyzer"> Analyzer that will be used for
+ /// analyzing suggestions while building the index. </param>
+ /// <param name="queryAnalyzer"> Analyzer that will be used for
+ /// analyzing query text during lookup </param>
+ /// <param name="options"> see <seealso cref="#EXACT_FIRST"/>, <seealso cref="#PRESERVE_SEP"/> </param>
+ /// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of
+ /// surface forms to keep for a single analyzed form.
+ /// When there are too many surface forms we discard the
+ /// lowest weighted ones. </param>
+ /// <param name="maxGraphExpansions"> Maximum number of graph paths
+ /// to expand from the analyzed form. Set this to -1 for
+ /// no limit. </param>
+ /// <param name="preservePositionIncrements"> Whether position holes
+ /// should appear in the automata </param>
+ public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options,
+ int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements)
+ {
+ this.indexAnalyzer = indexAnalyzer;
+ this.queryAnalyzer = queryAnalyzer;
+ if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0)
+ {
+ throw new System.ArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " +
+ options);
+ }
+ this.exactFirst = (options & EXACT_FIRST) != 0;
+ this.preserveSep = (options & PRESERVE_SEP) != 0;
+
+ // NOTE: this is just an implementation limitation; if
+ // somehow this is a problem we could fix it by using
+ // more than one byte to disambiguate ... but 256 seems
+ // like it should be way more than enough.
+ if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256)
+ {
+ throw new System.ArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " +
+ maxSurfaceFormsPerAnalyzedForm + ")");
+ }
+ this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
+
+ if (maxGraphExpansions < 1 && maxGraphExpansions != -1)
+ {
+ throw new System.ArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " +
+ maxGraphExpansions + ")");
+ }
+ this.maxGraphExpansions = maxGraphExpansions;
+ this.preservePositionIncrements = preservePositionIncrements;
+ }
+
+ /// <summary>
+ /// Returns byte size of the underlying FST. </summary>
+ public override long SizeInBytes()
+ {
+ return fst == null ? 0 : fst.SizeInBytes();
+ }
+
+ private void copyDestTransitions(State from, State to, IList<Transition> transitions)
+ {
+ if (to.Accept)
+ {
+ from.Accept = true;
+ }
+ foreach (Transition t in to.Transitions)
+ {
+ transitions.Add(t);
+ }
+ }
+
+ // Replaces SEP with epsilon or remaps them if
+ // we were asked to preserve them:
+ private void ReplaceSep(Automaton a)
+ {
+
+ State[] states = a.NumberedStates;
+
+ // Go in reverse topo sort so we know we only have to
+ // make one pass:
+ for (int stateNumber = states.Length - 1; stateNumber >= 0; stateNumber--)
+ {
+ State state = states[stateNumber];
+ IList<Transition> newTransitions = new List<Transition>();
+ foreach (Transition t in state.Transitions)
+ {
+ Debug.Assert(t.Min == t.Max);
+ if (t.Min == TokenStreamToAutomaton.POS_SEP)
+ {
+ if (preserveSep)
+ {
+ // Remap to SEP_LABEL:
+ newTransitions.Add(new Transition(SEP_LABEL, t.Dest));
+ }
+ else
+ {
+ copyDestTransitions(state, t.Dest, newTransitions);
+ a.Deterministic = false;
+ }
+ }
+ else if (t.Min == TokenStreamToAutomaton.HOLE)
+ {
+
+ // Just remove the hole: there will then be two
+ // SEP tokens next to each other, which will only
+ // match another hole at search time. Note that
+ // it will also match an empty-string token ... if
+ // that's somehow a problem we can always map HOLE
+ // to a dedicated byte (and escape it in the
+ // input).
+ copyDestTransitions(state, t.Dest, newTransitions);
+ a.Deterministic = false;
+ }
+ else
+ {
+ newTransitions.Add(t);
+ }
+ }
+ state.Transitions = newTransitions.ToArray();
+ }
+ }
+
+ /// <summary>
+ /// Used by subclass to change the lookup automaton, if
+ /// necessary.
+ /// </summary>
+ protected internal virtual Automaton convertAutomaton(Automaton a)
+ {
+ return a;
+ }
+
+ internal virtual TokenStreamToAutomaton TokenStreamToAutomaton
+ {
+ get
+ {
+ var tsta = new TokenStreamToAutomaton();
+ tsta.PreservePositionIncrements = preservePositionIncrements;
+ return tsta;
+ }
+ }
+
+ private class AnalyzingComparator : IComparer<BytesRef>
+ {
+
+ internal readonly bool hasPayloads;
+
+ public AnalyzingComparator(bool hasPayloads)
+ {
+ this.hasPayloads = hasPayloads;
+ }
+
+ internal readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
+ internal readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
+ internal readonly BytesRef scratchA = new BytesRef();
+ internal readonly BytesRef scratchB = new BytesRef();
+
+ public virtual int Compare(BytesRef a, BytesRef b)
+ {
+
+ // First by analyzed form:
+ readerA.Reset(a.Bytes, a.Offset, a.Length);
+ scratchA.Length = readerA.ReadShort();
+ scratchA.Bytes = a.Bytes;
+ scratchA.Offset = readerA.Position;
+
+ readerB.Reset(b.Bytes, b.Offset, b.Length);
+ scratchB.Bytes = b.Bytes;
+ scratchB.Length = readerB.ReadShort();
+ scratchB.Offset = readerB.Position;
+
+ int cmp = scratchA.CompareTo(scratchB);
+ if (cmp != 0)
+ {
+ return cmp;
+ }
+ readerA.SkipBytes(scratchA.Length);
+ readerB.SkipBytes(scratchB.Length);
+
+ // Next by cost:
+ long aCost = readerA.ReadInt();
+ long bCost = readerB.ReadInt();
+ Debug.Assert(DecodeWeight(aCost) >= 0);
+ Debug.Assert(DecodeWeight(bCost) >= 0);
+ if (aCost < bCost)
+ {
+ return -1;
+ }
+ else if (aCost > bCost)
+ {
+ return 1;
+ }
+
+ // Finally by surface form:
+ if (hasPayloads)
+ {
+ scratchA.Length = readerA.ReadShort();
+ scratchB.Length = readerB.ReadShort();
+ scratchA.Offset = readerA.Position;
+ scratchB.Offset = readerB.Position;
+ }
+ else
+ {
+ scratchA.Offset = readerA.Position;
+ scratchB.Offset = readerB.Position;
+ scratchA.Length = a.Length - scratchA.Offset;
+ scratchB.Length = b.Length - scratchB.Offset;
+ }
+
+ return scratchA.CompareTo(scratchB);
+ }
+ }
+
+ public override void Build(InputIterator iterator)
+ {
+ if (iterator.HasContexts)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+ string prefix = this.GetType().Name;
+ var directory = OfflineSorter.DefaultTempDir();
+ var tempInput = File.CreateTempFile(prefix, ".input", directory);
+ var tempSorted = File.CreateTempFile(prefix, ".sorted", directory);
+
+ hasPayloads = iterator.HasPayloads;
+
+ var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+ OfflineSorter.ByteSequencesReader reader = null;
+ var scratch = new BytesRef();
+
+ TokenStreamToAutomaton ts2a = TokenStreamToAutomaton;
+
+ bool success = false;
+ count = 0;
+ sbyte[] buffer = new sbyte[8];
+ try
+ {
+ ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+ BytesRef surfaceForm;
+
+ while ((surfaceForm = iterator.Next()) != null)
+ {
+ HashSet<IntsRef> paths = ToFiniteStrings(surfaceForm, ts2a);
+
+ maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);
+
+ foreach (IntsRef path in paths)
+ {
+
+ Util.Fst.Util.ToBytesRef(path, scratch);
+
+ // length of the analyzed text (FST input)
+ if (scratch.Length > short.MaxValue - 2)
+ {
+ throw new System.ArgumentException("cannot handle analyzed forms > " + (short.MaxValue - 2) +
+ " in length (got " + scratch.length + ")");
+ }
+ short analyzedLength = (short) scratch.Length;
+
+ // compute the required length:
+ // analyzed sequence + weight (4) + surface + analyzedLength (short)
+ int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;
+
+ BytesRef payload;
+
+ if (hasPayloads)
+ {
+ if (surfaceForm.Length > (short.MaxValue - 2))
+ {
+ throw new ArgumentException("cannot handle surface form > " + (short.MaxValue - 2) +
+ " in length (got " + surfaceForm.Length + ")");
+ }
+ payload = iterator.Payload;
+ // payload + surfaceLength (short)
+ requiredLength += payload.Length + 2;
+ }
+ else
+ {
+ payload = null;
+ }
+
+ buffer = ArrayUtil.Grow(buffer, requiredLength);
+
+ output.Reset(buffer);
+
+ output.WriteShort(analyzedLength);
+
+ output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);
+
+ output.WriteInt(EncodeWeight(iterator.Weight));
+
+ if (hasPayloads)
+ {
+ for (int i = 0; i < surfaceForm.Length; i++)
+ {
+ if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
+ {
+ throw new ArgumentException(
+ "surface form cannot contain unit separator character U+001F; this character is reserved");
+ }
+ }
+ output.WriteShort((short) surfaceForm.Length);
+ output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
+ output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
+ }
+ else
+ {
+ output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
+ }
+
+ Debug.Assert(output.Position == requiredLength, output.Position + " vs " + requiredLength);
+
+ writer.Write(buffer, 0, output.Position);
+ }
+ count++;
+ }
+ writer.Dispose();
+
+ // Sort all input/output pairs (required by FST.Builder):
+ (new OfflineSorter(new AnalyzingComparator(hasPayloads))).Sort(tempInput, tempSorted);
+
+ // Free disk space:
+ tempInput.Delete();
+
+ reader = new OfflineSorter.ByteSequencesReader(tempSorted);
+
+ PairOutputs<long?, BytesRef> outputs = new PairOutputs<long?, BytesRef>(PositiveIntOutputs.Singleton,
+ ByteSequenceOutputs.Singleton);
+ Builder<PairOutputs.Pair<long?, BytesRef>> builder =
+ new Builder<PairOutputs.Pair<long?, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ // Build FST:
+ BytesRef previousAnalyzed = null;
+ BytesRef analyzed = new BytesRef();
+ BytesRef surface = new BytesRef();
+ IntsRef scratchInts = new IntsRef();
+ var input = new ByteArrayDataInput();
+
+ // Used to remove duplicate surface forms (but we
+ // still index the highest-weight one). We clear
+ // this when we see a new analyzed form, so it cannot
+ // grow unbounded (at most 256 entries):
+ var seenSurfaceForms = new HashSet<BytesRef>();
+
+ var dedup = 0;
+ while (reader.Read(scratch))
+ {
+ input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
+ short analyzedLength = input.ReadShort();
+ analyzed.Grow(analyzedLength + 2);
+ input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
+ analyzed.Length = analyzedLength;
+
+ long cost = input.ReadInt();
+
+ surface.Bytes = scratch.Bytes;
+ if (hasPayloads)
+ {
+ surface.Length = input.ReadShort();
+ surface.Offset = input.Position;
+ }
+ else
+ {
+ surface.Offset = input.Position;
+ surface.Length = scratch.Length - surface.Offset;
+ }
+
+ if (previousAnalyzed == null)
+ {
+ previousAnalyzed = new BytesRef();
+ previousAnalyzed.CopyBytes(analyzed);
+ seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+ }
+ else if (analyzed.Equals(previousAnalyzed))
+ {
+ dedup++;
+ if (dedup >= maxSurfaceFormsPerAnalyzedForm)
+ {
+ // More than maxSurfaceFormsPerAnalyzedForm
+ // dups: skip the rest:
+ continue;
+ }
+ if (seenSurfaceForms.Contains(surface))
+ {
+ continue;
+ }
+ seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+ }
+ else
+ {
+ dedup = 0;
+ previousAnalyzed.CopyBytes(analyzed);
+ seenSurfaceForms.Clear();
+ seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
+ }
+
+ // TODO: I think we can avoid the extra 2 bytes when
+ // there is no dup (dedup==0), but we'd have to fix
+ // the exactFirst logic ... which would be sort of
+ // hairy because we'd need to special case the two
+ // (dup/not dup)...
+
+ // NOTE: must be byte 0 so we sort before whatever
+ // is next
+ analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
+ analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (sbyte) dedup;
+ analyzed.Length += 2;
+
+ Util.Fst.Util.ToIntsRef(analyzed, scratchInts);
+ //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
+ if (!hasPayloads)
+ {
+ builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
+ }
+ else
+ {
+ int payloadOffset = input.Position + surface.Length;
+ int payloadLength = scratch.Length - payloadOffset;
+ BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
+ Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
+ br.Bytes[surface.Length] = PAYLOAD_SEP;
+ Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
+ br.Length = br.Bytes.Length;
+ builder.Add(scratchInts, outputs.NewPair(cost, br));
+ }
+ }
+ fst = builder.Finish();
+
+ //Util.dotToFile(fst, "/tmp/suggest.dot");
+
+ success = true;
+ }
+ finally
+ {
+ if (success)
+ {
+ IOUtils.Close(reader, writer);
+ }
+ else
+ {
+ IOUtils.CloseWhileHandlingException(reader, writer);
+ }
+
+ tempInput.Delete();
+ tempSorted.Delete();
+ }
+ }
+
+ public override bool Store(DataOutput output)
+ {
+ output.WriteVLong(count);
+ if (fst == null)
+ {
+ return false;
+ }
+
+ fst.Save(output);
+ output.WriteVInt(maxAnalyzedPathsForOneInput);
+ output.WriteByte((sbyte) (hasPayloads ? 1 : 0));
+ return true;
+ }
+
+ public override bool Load(DataInput input)
+ {
+ count = input.ReadVLong();
+ this.fst = new FST<PairOutputs.Pair<long?, BytesRef>>(input, new PairOutputs<long?, BytesRef>(PositiveIntOutputs.Singleton, ByteSequenceOutputs.Singleton));
+ maxAnalyzedPathsForOneInput = input.ReadVInt();
+ hasPayloads = input.ReadByte() == 1;
+ return true;
+ }
+
+ private LookupResult GetLookupResult(long? output1, BytesRef output2, CharsRef spare)
+ {
+ LookupResult result;
+ if (hasPayloads)
+ {
+ int sepIndex = -1;
+ for (int i = 0; i < output2.Length; i++)
+ {
+ if (output2.Bytes[output2.Offset + i] == PAYLOAD_SEP)
+ {
+ sepIndex = i;
+ break;
+ }
+ }
+ Debug.Assert(sepIndex != -1);
+ spare.Grow(sepIndex);
+
+ int payloadLen = output2.Length - sepIndex - 1;
+ UnicodeUtil.UTF8toUTF16(output2.Bytes, output2.Offset, sepIndex, spare);
+ BytesRef payload = new BytesRef(payloadLen);
+ Array.Copy(output2.Bytes, sepIndex + 1, payload.Bytes, 0, payloadLen);
+ payload.Length = payloadLen;
+ result = new LookupResult(spare.ToString(), DecodeWeight(output1.Value), payload);
+ }
+ else
+ {
+ spare.Grow(output2.Length);
+ UnicodeUtil.UTF8toUTF16(output2, spare);
+ result = new LookupResult(spare.ToString(), DecodeWeight(output1.Value));
+ }
+
+ return result;
+ }
+
+ private bool SameSurfaceForm(BytesRef key, BytesRef output2)
+ {
+ if (hasPayloads)
+ {
+ // output2 has at least PAYLOAD_SEP byte:
+ if (key.Length >= output2.Length)
+ {
+ return false;
+ }
+ for (int i = 0; i < key.Length; i++)
+ {
+ if (key.Bytes[key.Offset + i] != output2.Bytes[output2.Offset + i])
+ {
+ return false;
+ }
+ }
+ return output2.Bytes[output2.Offset + key.Length] == PAYLOAD_SEP;
+ }
+ else
+ {
+ return key.BytesEquals(output2);
+ }
+ }
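+
+ // For example (illustrative bytes): with payloads enabled, the FST output
+ // for surface form "foo" with payload "bar" is the UTF-8 bytes of "foo",
+ // then PAYLOAD_SEP (U+001F), then the bytes of "bar"; SameSurfaceForm
+ // compares the key against the bytes before the separator.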
+
+ public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+ {
+ Debug.Assert(num > 0);
+
+ if (onlyMorePopular)
+ {
+ throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
+ }
+ if (contexts != null)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+ if (fst == null)
+ {
+ return new List<LookupResult>();
+ }
+
+ //System.out.println("lookup key=" + key + " num=" + num);
+ for (var i = 0; i < key.Length; i++)
+ {
+ if (key[i] == 0x1E)
+ {
+ throw new ArgumentException(
+ "lookup key cannot contain HOLE character U+001E; this character is reserved");
+ }
+ if (key[i] == 0x1F)
+ {
+ throw new ArgumentException(
+ "lookup key cannot contain unit separator character U+001F; this character is reserved");
+ }
+ }
+
+ var utf8Key = new BytesRef(key);
+ try
+ {
+
+ Automaton lookupAutomaton = ToLookupAutomaton(key);
+
+ var spare = new CharsRef();
+
+ //System.out.println(" now intersect exactFirst=" + exactFirst);
+
+ // Intersect automaton w/ suggest wFST and get all
+ // prefix starting nodes & their outputs:
+ //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
+
+ //System.out.println(" prefixPaths: " + prefixPaths.size());
+
+ FST.BytesReader bytesReader = fst.BytesReader;
+
+ FST.Arc<PairOutputs.Pair<long?, BytesRef>> scratchArc = new FST.Arc<PairOutputs.Pair<long?, BytesRef>>();
+
+ IList<LookupResult> results = new List<LookupResult>();
+
+ IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> prefixPaths =
+ FSTUtil.IntersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
+
+ if (exactFirst)
+ {
+
+ int count = 0;
+ foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+ {
+ if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
+ {
+ // This node has END_BYTE arc leaving, meaning it's an
+ // "exact" match:
+ count++;
+ }
+ }
+
+ // Searcher just to find the single exact only
+ // match, if present:
+ Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>> searcher;
+ searcher = new Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>>(fst, count*maxSurfaceFormsPerAnalyzedForm,
+ count*maxSurfaceFormsPerAnalyzedForm, weightComparator);
+
+ // NOTE: we could almost get away with only using
+ // the first start node. The only catch is if
+ // maxSurfaceFormsPerAnalyzedForm had kicked in and
+ // pruned our exact match from one of these nodes
+ // ...:
+ foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+ {
+ if (fst.FindTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null)
+ {
+ // This node has END_BYTE arc leaving, meaning it's an
+ // "exact" match:
+ searcher.AddStartPaths(scratchArc, fst.Outputs.Add(path.output, scratchArc.Output), false,
+ path.input);
+ }
+ }
+
+ Util.TopResults<PairOutputs.Pair<long?, BytesRef>> completions = searcher.Search();
+ Debug.Assert(completions.IsComplete);
+
+ // NOTE: this is rather inefficient: we enumerate
+ // every matching "exactly the same analyzed form"
+ // path, and then do linear scan to see if one of
+ // these exactly matches the input. It should be
+ // possible (though hairy) to do something similar
+ // to getByOutput, since the surface form is encoded
+ // into the FST output, so we more efficiently hone
+ // in on the exact surface-form match. Still, I
+ // suspect very little time is spent in this linear
+ // search: it's bounded by how many prefix start
+ // nodes we have and the
+ // maxSurfaceFormsPerAnalyzedForm:
+ foreach (Util.Result<PairOutputs.Pair<long?, BytesRef>> completion in completions)
+ {
+ BytesRef output2 = completion.Output.Output2;
+ if (SameSurfaceForm(utf8Key, output2))
+ {
+ results.Add(GetLookupResult(completion.Output.Output1, output2, spare));
+ break;
+ }
+ }
+
+ if (results.Count == num)
+ {
+ // That was quick:
+ return results;
+ }
+ }
+
+ Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>> searcher;
+ searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num - results.Count,
+ num*maxAnalyzedPathsForOneInput, weightComparator, utf8Key, results);
+
+ prefixPaths = GetFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
+
+ foreach (FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>> path in prefixPaths)
+ {
+ searcher.AddStartPaths(path.fstNode, path.output, true, path.input);
+ }
+
+ Util.TopResults<PairOutputs.Pair<long?, BytesRef>> completions = searcher.Search();
+ Debug.Assert(completions.IsComplete);
+
+ foreach (Util.Result<PairOutputs.Pair<long?, BytesRef>> completion in completions)
+ {
+
+ LookupResult result = GetLookupResult(completion.Output.Output1, completion.Output.Output2, spare);
+
+ // TODO: for fuzzy case would be nice to return
+ // how many edits were required
+
+ //System.out.println(" result=" + result);
+ results.Add(result);
+
+ if (results.Count == num)
+ {
+ // In the exactFirst=true case the search may
+ // produce one extra path
+ break;
+ }
+ }
+
+ return results;
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus.Message, bogus);
+ }
+ }
+
+ private class TopNSearcherAnonymousInnerClassHelper : Util.TopNSearcher<PairOutputs.Pair<long?, BytesRef>>
+ {
+ private readonly AnalyzingSuggester outerInstance;
+
+ private BytesRef utf8Key;
+ private IList<LookupResult> results;
+
+ public TopNSearcherAnonymousInnerClassHelper(AnalyzingSuggester outerInstance,
+ FST<PairOutputs.Pair<long?, BytesRef>> fst, int topN, int maxQueueDepth,
+ IComparer<PairOutputs.Pair<long?, BytesRef>> weightComparator,
+ BytesRef utf8Key, IList<LookupResult> results)
+ : base(fst, topN, maxQueueDepth, weightComparator)
+ {
+ this.outerInstance = outerInstance;
+ this.utf8Key = utf8Key;
+ this.results = results;
+ seen = new HashSet<BytesRef>();
+ }
+
+ private readonly HashSet<BytesRef> seen;
+
+ protected internal override bool AcceptResult(IntsRef input, PairOutputs.Pair<long?, BytesRef> output)
+ {
+
+ // Dedup: when the input analyzes to a graph we
+ // can get duplicate surface forms:
+ if (seen.Contains(output.Output2))
+ {
+ return false;
+ }
+ seen.Add(output.Output2);
+
+ if (!outerInstance.exactFirst)
+ {
+ return true;
+ }
+ else
+ {
+ // In exactFirst mode, don't accept any paths
+ // matching the surface form since that will
+ // create duplicate results:
+ if (outerInstance.SameSurfaceForm(utf8Key, output.Output2))
+ {
+ // We found exact match, which means we should
+ // have already found it in the first search:
+ Debug.Assert(results.Count == 1);
+ return false;
+ }
+ else
+ {
+ return true;
+ }
+ }
+ }
+ }
+
+ public override long Count
+ {
+ get { return count; }
+ }
+
+ /// <summary>
+ /// Returns all prefix paths to initialize the search.
+ /// </summary>
+ protected internal virtual IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> GetFullPrefixPaths(
+ IList<FSTUtil.Path<PairOutputs.Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton,
+ FST<PairOutputs.Pair<long?, BytesRef>> fst)
+ {
+ return prefixPaths;
+ }
+
+ internal HashSet<IntsRef> ToFiniteStrings(BytesRef surfaceForm, TokenStreamToAutomaton ts2a)
+ {
+ // Analyze surface form:
+ Automaton automaton = null;
+ TokenStream ts = indexAnalyzer.TokenStream("", surfaceForm.Utf8ToString());
+ try
+ {
+
+ // Create corresponding automaton: labels are bytes
+ // from each analyzed token, with byte 0 used as
+ // separator between tokens:
+ automaton = ts2a.ToAutomaton(ts);
+ }
+ finally
+ {
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+
+ ReplaceSep(automaton);
+ automaton = convertAutomaton(automaton);
+
+ Debug.Assert(SpecialOperations.IsFinite(automaton));
+
+ // Get all paths from the automaton (there can be
+ // more than one path, eg if the analyzer created a
+ // graph using SynFilter or WDF):
+
+ // TODO: we could walk & add simultaneously, so we
+ // don't have to alloc [possibly biggish]
+ // intermediate HashSet in RAM:
+ return SpecialOperations.GetFiniteStrings(automaton, maxGraphExpansions);
+ }
+
+ internal Automaton ToLookupAutomaton(string key)
+ {
+ // TODO: is there a Reader from a CharSequence?
+ // Turn tokenstream into automaton:
+ Automaton automaton = null;
+ TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
+ try
+ {
+ automaton = this.TokenStreamToAutomaton.ToAutomaton(ts);
+ }
+ finally
+ {
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+
+ // TODO: we could use the end offset to "guess"
+ // whether the final token was a partial token; this
+ // would only be a heuristic ... but maybe an OK one.
+ // This way we could eg differentiate "net" from "net ",
+ // which we can't today...
+
+ ReplaceSep(automaton);
+
+ // TODO: we can optimize this somewhat by determinizing
+ // while we convert
+ BasicOperations.Determinize(automaton);
+ return automaton;
+ }
+
+
+
+ /// <summary>
+ /// Returns the weight associated with an input string,
+ /// or null if it does not exist.
+ /// </summary>
+ public virtual object Get(string key)
+ {
+ throw new NotSupportedException();
+ }
+
+ /// <summary>
+ /// cost -> weight </summary>
+ private static int DecodeWeight(long encoded)
+ {
+ return (int) (int.MaxValue - encoded);
+ }
+
+ /// <summary>
+ /// weight -> cost </summary>
+ private static int EncodeWeight(long value)
+ {
+ if (value < 0 || value > int.MaxValue)
+ {
+ throw new System.NotSupportedException("cannot encode value: " + value);
+ }
+ return int.MaxValue - (int) value;
+ }
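+
+ // For example: EncodeWeight(7) == int.MaxValue - 7 and
+ // DecodeWeight(int.MaxValue - 7) == 7, so storing costs instead of
+ // weights lets the FST's minimum-cost search surface the highest-weight
+ // suggestions first.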
+
+ internal static readonly IComparer<PairOutputs.Pair<long?, BytesRef>> weightComparator =
+ new ComparatorAnonymousInnerClassHelper();
+
+ private class ComparatorAnonymousInnerClassHelper : IComparer<PairOutputs.Pair<long?, BytesRef>>
+ {
+ public ComparatorAnonymousInnerClassHelper()
+ {
+ }
+
+ public virtual int Compare(PairOutputs.Pair<long?, BytesRef> left, PairOutputs.Pair<long?, BytesRef> right)
+ {
+ return left.Output1.CompareTo(right.Output1);
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
new file mode 100644
index 0000000..df82828
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/BlendedInfixSuggester.cs
@@ -0,0 +1,316 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+using Directory = Lucene.Net.Store.Directory;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ // TODO:
+ // - allow to use the search score
+
+ /// <summary>
+ /// Extension of the AnalyzingInfixSuggester which transforms the weight
+ /// after search to take into account the position of the searched term in
+ /// the indexed text.
+ /// Note that this suggester searches more elements than requested and
+ /// applies the positional weighting afterwards, which can be costly for
+ /// long suggestions.
+ ///
+ /// @lucene.experimental
+ /// </summary>
+ public class BlendedInfixSuggester : AnalyzingInfixSuggester
+ {
+
+ /// <summary>
+ /// Coefficient used for linear blending
+ /// </summary>
+ protected internal static double LINEAR_COEF = 0.10;
+
+ /// <summary>
+ /// Default factor
+ /// </summary>
+ public static int DEFAULT_NUM_FACTOR = 10;
+
+ /// <summary>
+ /// Factor to multiply the number of searched elements
+ /// </summary>
+ private readonly int numFactor;
+
+ /// <summary>
+ /// Type of blender used by the suggester
+ /// </summary>
+ private readonly BlenderType blenderType;
+
+ /// <summary>
+ /// The different types of blender.
+ /// </summary>
+ public enum BlenderType
+ {
+ /// <summary>
+ /// Application dependent; override
+ /// <see cref="CalculateCoefficient"/> to compute it.
+ /// </summary>
+ CUSTOM,
+ /// <summary>
+ /// weight*(1 - 0.10*position) </summary>
+ POSITION_LINEAR,
+ /// <summary>
+ /// weight/(1+position) </summary>
+ POSITION_RECIPROCAL,
+ // TODO:
+ //SCORE
+ }
+
+ /// <summary>
+ /// Create a new instance, loading from a previously built
+ /// directory, if it exists.
+ /// </summary>
+ public BlendedInfixSuggester(Version matchVersion, Directory dir, Analyzer analyzer)
+ : base(matchVersion, dir, analyzer)
+ {
+ this.blenderType = BlenderType.POSITION_LINEAR;
+ this.numFactor = DEFAULT_NUM_FACTOR;
+ }
+
+ /// <summary>
+ /// Create a new instance, loading from a previously built
+ /// directory, if it exists.
+ /// </summary>
+ /// <param name="blenderType"> Type of blending strategy, see BlenderType for more precisions </param>
+ /// <param name="numFactor"> Factor to multiply the number of searched elements before ponderate </param>
+ /// <exception cref="IOException"> If there are problems opening the underlying Lucene index. </exception>
+ public BlendedInfixSuggester(Version matchVersion, Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, BlenderType blenderType, int numFactor)
+ : base(matchVersion, dir, indexAnalyzer, queryAnalyzer, minPrefixChars)
+ {
+ this.blenderType = blenderType;
+ this.numFactor = numFactor;
+ }
+
+ public override IList<Lookup.LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+ {
+ // here we multiply the number of searched elements by the configured factor
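+ // (e.g. with num = 5 and the default numFactor = 10, 50 hits are
+ // collected so that re-weighting by position can promote results the
+ // raw weight sort would otherwise have cut off)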
+ return base.Lookup(key, contexts, onlyMorePopular, num * numFactor);
+ }
+
+ public override IList<Lookup.LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
+ {
+ // here we multiply the number of searched elements by the configured factor
+ return base.Lookup(key, contexts, num * numFactor, allTermsRequired, doHighlight);
+ }
+
+ protected internal override FieldType TextFieldType
+ {
+ get
+ {
+ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+ ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
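+ // Term vectors with positions are stored so that CreateCoefficient
+ // can later locate the first occurrence of each matched token: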
+ ft.StoreTermVectors = true;
+ ft.StoreTermVectorPositions = true;
+ ft.OmitNorms = true;
+
+ return ft;
+ }
+ }
+
+ protected internal override IList<Lookup.LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits,
+ int num, string key, bool doHighlight, HashSet<string> matchedTokens, string prefixToken)
+ {
+
+ BinaryDocValues textDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, TEXT_FIELD_NAME);
+ Debug.Assert(textDV != null);
+
+ // This will just be null if app didn't pass payloads to build():
+ // TODO: maybe just stored fields? they compress...
+ BinaryDocValues payloadsDV = MultiDocValues.GetBinaryValues(searcher.IndexReader, "payloads");
+
+ SortedSet<Lookup.LookupResult> results = new SortedSet<Lookup.LookupResult>(LOOKUP_COMP);
+
+ // reduce num back to the size originally requested by the caller
+ int actualNum = num / numFactor;
+
+ BytesRef scratch = new BytesRef();
+ for (int i = 0; i < hits.ScoreDocs.Length; i++)
+ {
+ FieldDoc fd = (FieldDoc)hits.ScoreDocs[i];
+
+ textDV.Get(fd.Doc, scratch);
+ string text = scratch.Utf8ToString();
+ long weight = (long)fd.Fields[0];
+
+ BytesRef payload;
+ if (payloadsDV != null)
+ {
+ payload = new BytesRef();
+ payloadsDV.Get(fd.Doc, payload);
+ }
+ else
+ {
+ payload = null;
+ }
+
+ double coefficient;
+ if (text.StartsWith(key, StringComparison.Ordinal))
+ {
+ // if hit starts with the key, we don't change the score
+ coefficient = 1;
+ }
+ else
+ {
+ coefficient = CreateCoefficient(searcher, fd.Doc, matchedTokens, prefixToken);
+ }
+
+ long score = (long)(weight * coefficient);
+
+ LookupResult result;
+ if (doHighlight)
+ {
+ object highlightKey = Highlight(text, matchedTokens, prefixToken);
+ result = new LookupResult(highlightKey.ToString(), highlightKey, score, payload);
+ }
+ else
+ {
+ result = new LookupResult(text, score, payload);
+ }
+
+ BoundedTreeAdd(results, result, actualNum);
+ }
+
+ return new List<LookupResult>(results.Reverse());
+ }
+
+ /// <summary>
+ /// Add an element to the tree respecting a size limit
+ /// </summary>
+ /// <param name="results"> the tree to add in </param>
+ /// <param name="result"> the result we try to add </param>
+ /// <param name="num"> size limit </param>
+ private static void BoundedTreeAdd(SortedSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num)
+ {
+
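+ // Illustrative walk-through: with num = 2, adding results weighted
+ // 5, 3, then 7 first accepts 5 and 3; when 7 arrives the set is full,
+ // so the current minimum (3) is evicted, leaving {5, 7}.
+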
+ if (results.Count >= num)
+ {
+ if (results.Min.value < result.value)
+ {
+ // .NET's SortedSet has no PollFirst(); remove the minimum explicitly
+ results.Remove(results.Min);
+ }
+ else
+ {
+ return;
+ }
+ }
+
+ results.Add(result);
+ }
+
+ /// <summary>
+ /// Create the coefficient to transform the weight.
+ /// </summary>
+ /// <param name="doc"> id of the document </param>
+ /// <param name="matchedTokens"> tokens found in the query </param>
+ /// <param name="prefixToken"> unfinished token in the query </param>
+ /// <returns> the coefficient </returns>
+ /// <exception cref="IOException"> If there are problems reading term vectors from the underlying Lucene index. </exception>
+ private double CreateCoefficient(IndexSearcher searcher, int doc, HashSet<string> matchedTokens, string prefixToken)
+ {
+
+ Terms tv = searcher.IndexReader.GetTermVector(doc, TEXT_FIELD_NAME);
+ TermsEnum it = tv.Iterator(TermsEnum.EMPTY);
+
+ int position = int.MaxValue;
+ BytesRef term;
+ // find the closest token position
+ while ((term = it.Next()) != null)
+ {
+
+ string docTerm = term.Utf8ToString();
+
+ if (matchedTokens.Contains(docTerm) || (prefixToken != null && docTerm.StartsWith(prefixToken, StringComparison.Ordinal)))
+ {
+
+ DocsAndPositionsEnum docPosEnum = it.DocsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
+ docPosEnum.NextDoc();
+
+ // use the first occurrence of the term
+ int p = docPosEnum.NextPosition();
+ if (p < position)
+ {
+ position = p;
+ }
+ }
+ }
+
+ // create corresponding coefficient based on position
+ return CalculateCoefficient(position);
+ }
+
+ /// <summary>
+ /// Calculate the weight coefficient based on the position of the first matching word.
+ /// Subclass should override it to adapt it to particular needs </summary>
+ /// <param name="position"> of the first matching word in text </param>
+ /// <returns> the coefficient </returns>
+ protected internal virtual double CalculateCoefficient(int position)
+ {
+
+ double coefficient;
+ switch (blenderType)
+ {
+ case BlendedInfixSuggester.BlenderType.POSITION_LINEAR:
+ coefficient = 1 - LINEAR_COEF * position;
+ break;
+
+ case BlendedInfixSuggester.BlenderType.POSITION_RECIPROCAL:
+ coefficient = 1.0 / (position + 1);
+ break;
+
+ default:
+ coefficient = 1;
+ break;
+ }
+
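+ // Illustrative values: at position 0 both strategies yield 1.0; at
+ // position 2, POSITION_LINEAR gives 1 - 0.10*2 = 0.8 while
+ // POSITION_RECIPROCAL gives 1.0/(2 + 1) ~= 0.33.
+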
+ return coefficient;
+ }
+
+ private static readonly IComparer<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparator();
+
+ private class LookUpComparator : IComparer<Lookup.LookupResult>
+ {
+
+ public virtual int Compare(Lookup.LookupResult o1, Lookup.LookupResult o2)
+ {
+ // order on weight
+ if (o1.value > o2.value)
+ {
+ return 1;
+ }
+ else if (o1.value < o2.value)
+ {
+ return -1;
+ }
+
+ // otherwise on alphabetic order
+ return CHARSEQUENCE_COMPARATOR.Compare(o1.key, o2.key);
+ }
+ }
+ }
+}
\ No newline at end of file