You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2016/12/10 19:35:33 UTC
[3/7] lucenenet git commit: Corrected physical directory locations of
Lucene.Net.Sandbox and Lucene.Net.Tests.Sandbox
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs b/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
new file mode 100644
index 0000000..34da622
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
@@ -0,0 +1,397 @@
+\ufeffusing Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Search.Similarities;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+ /// In effect this mixes the behaviour of <see cref="FuzzyQuery"/> and MoreLikeThis but with special consideration
+ /// of fuzzy scoring factors.
+ /// This generally produces good results for queries where users may provide details in a number of
+ /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+ /// a fast query.
+ /// <para/>
+ /// For each source term the fuzzy variants are held in a <see cref="BooleanQuery"/> with no coord factor (because
+ /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+ /// <see cref="TermQuery"/> is used for variants and does not use that variant term's IDF because this would favour rarer
+ /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
+ /// term) and this is factored into the variant's boost. If the source query term does not exist in the
+ /// index the average IDF of the variants is used.
+ /// </summary>
+ public class FuzzyLikeThisQuery : Query
+ {
+ // TODO: generalize this query (at least it should not reuse this static sim!
+ // a better way might be to convert this into multitermquery rewrite methods.
+ // the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq)
+ // provided to TermQuery, so that the general idea is agnostic to any scoring system...
+ internal static TFIDFSimilarity sim = new DefaultSimilarity();
+ Query rewrittenQuery = null;
+ List<FieldVals> fieldVals = new List<FieldVals>();
+ Analyzer analyzer;
+
+ ScoreTermQueue q;
+ int MAX_VARIANTS_PER_TERM = 50;
+ bool ignoreTF = false;
+ private int maxNumTerms;
+
+ /// <summary>
+ /// Builds a hash from the base query hash, the analyzer, the field values,
+ /// the <c>ignoreTF</c> flag and <c>maxNumTerms</c> - the same members that
+ /// <see cref="Equals(object)"/> compares.
+ /// </summary>
+ public override int GetHashCode()
+ {
+ const int multiplier = 31;
+ int hash = base.GetHashCode();
+
+ // a missing analyzer or field list contributes 0
+ int analyzerHash = (analyzer == null) ? 0 : analyzer.GetHashCode();
+ hash = multiplier * hash + analyzerHash;
+
+ int fieldValsHash = (fieldVals == null) ? 0 : fieldVals.GetValueHashCode();
+ hash = multiplier * hash + fieldValsHash;
+
+ hash = multiplier * hash + (ignoreTF ? 1231 : 1237);
+ hash = multiplier * hash + maxNumTerms;
+ return hash;
+ }
+
+ /// <summary>
+ /// Two queries are equal when they have the same runtime type, the base
+ /// <see cref="Query"/> comparison succeeds, and their analyzer, field values,
+ /// <c>ignoreTF</c> flag and <c>maxNumTerms</c> all match.
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (obj == null || GetType() != obj.GetType() || !base.Equals(obj))
+ {
+ return false;
+ }
+
+ FuzzyLikeThisQuery that = (FuzzyLikeThisQuery)obj;
+
+ bool sameAnalyzer = (analyzer == null)
+ ? that.analyzer == null
+ : analyzer.Equals(that.analyzer);
+ if (!sameAnalyzer)
+ {
+ return false;
+ }
+
+ bool sameFieldVals = (fieldVals == null)
+ ? that.fieldVals == null
+ : fieldVals.ValueEquals(that.fieldVals);
+ if (!sameFieldVals)
+ {
+ return false;
+ }
+
+ return ignoreTF == that.ignoreTF && maxNumTerms == that.maxNumTerms;
+ }
+
+ /// <summary>
+ /// Creates a query that collects its candidate terms in a queue sized to <paramref name="maxNumTerms"/>.
+ /// </summary>
+ /// <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a <see cref="BooleanQuery"/></param>
+ /// <param name="analyzer">The analyzer used to tokenize the query strings supplied through <see cref="AddTerms(string, string, float, int)"/></param>
+ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+ {
+ q = new ScoreTermQueue(maxNumTerms);
+ this.analyzer = analyzer;
+ this.maxNumTerms = maxNumTerms;
+ }
+
+ /// <summary>
+ /// Records one request to add terms: the raw query string together with the field name,
+ /// minimum similarity and prefix length it was submitted with.
+ /// </summary>
+ internal class FieldVals
+ {
+ internal string queryString;
+ internal string fieldName;
+ internal float minSimilarity;
+ internal int prefixLength;
+
+ public FieldVals(string name, float similarity, int length, string queryString)
+ {
+ this.fieldName = name;
+ this.minSimilarity = similarity;
+ this.prefixLength = length;
+ this.queryString = queryString;
+ }
+
+ /// <summary>
+ /// Combines the hashes of all four fields; a null string contributes 0.
+ /// </summary>
+ public override int GetHashCode()
+ {
+ const int multiplier = 31;
+ int hash = 1;
+
+ int fieldNameHash = (fieldName == null) ? 0 : fieldName.GetHashCode();
+ hash = multiplier * hash + fieldNameHash;
+ hash = multiplier * hash + Number.FloatToIntBits(minSimilarity);
+ hash = multiplier * hash + prefixLength;
+
+ int queryStringHash = (queryString == null) ? 0 : queryString.GetHashCode();
+ hash = multiplier * hash + queryStringHash;
+ return hash;
+ }
+
+ /// <summary>
+ /// Equal only to an instance of the same runtime type whose four fields match.
+ /// Strings are compared ordinally and the similarity is compared through
+ /// <c>Number.FloatToIntBits</c>.
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (obj == null || GetType() != obj.GetType())
+ {
+ return false;
+ }
+
+ FieldVals that = (FieldVals)obj;
+
+ bool sameFieldName = (fieldName == null)
+ ? that.fieldName == null
+ : fieldName.Equals(that.fieldName, StringComparison.Ordinal);
+ if (!sameFieldName)
+ {
+ return false;
+ }
+
+ if (Number.FloatToIntBits(minSimilarity) != Number.FloatToIntBits(that.minSimilarity))
+ {
+ return false;
+ }
+ if (prefixLength != that.prefixLength)
+ {
+ return false;
+ }
+
+ return (queryString == null)
+ ? that.queryString == null
+ : queryString.Equals(that.queryString, StringComparison.Ordinal);
+ }
+ }
+
+ /// <summary>
+ /// Adds user input for "fuzzification"
+ /// </summary>
+ /// <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be generated</param>
+ /// <param name="fieldName">The field whose indexed terms the fuzzy variants are drawn from</param>
+ /// <param name="minSimilarity">The minimum similarity of the term variants (see <see cref="FuzzyTermsEnum"/>)</param>
+ /// <param name="prefixLength">Length of required common prefix on variant terms (see <see cref="FuzzyTermsEnum"/>)</param>
+ public virtual void AddTerms(string queryString, string fieldName, float minSimilarity, int prefixLength)
+ {
+ fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
+ }
+
+
+ /// <summary>
+ /// Analyzes <c>f.queryString</c>, enumerates fuzzy variants of each distinct token among the
+ /// terms of <c>f.fieldName</c>, and inserts the retained variants into the global queue <c>q</c>.
+ /// Returns without doing anything when the query string is null or the field has no terms
+ /// in <paramref name="reader"/>.
+ /// </summary>
+ /// <param name="reader">Supplies the field's terms, the document frequencies and the document count.</param>
+ /// <param name="f">The field name, query string, minimum similarity and prefix length to expand.</param>
+ private void AddTerms(IndexReader reader, FieldVals f)
+ {
+ if (f.queryString == null) return;
+ Terms terms = MultiFields.GetTerms(reader, f.fieldName);
+ if (terms == null)
+ {
+ return;
+ }
+ TokenStream ts = analyzer.TokenStream(f.fieldName, f.queryString);
+ try
+ {
+ ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
+
+ int corpusNumDocs = reader.NumDocs;
+ // token texts already expanded, so a repeated token is only processed once
+ HashSet<string> processedTerms = new HashSet<string>();
+ ts.Reset();
+ while (ts.IncrementToken())
+ {
+ string term = termAtt.ToString();
+ if (!processedTerms.Contains(term))
+ {
+ processedTerms.Add(term);
+ ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+ float minScore = 0;
+ Term startTerm = new Term(f.fieldName, term);
+ AttributeSource atts = new AttributeSource();
+ IMaxNonCompetitiveBoostAttribute maxBoostAtt =
+ atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
+#pragma warning disable 612, 618
+ SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
+#pragma warning restore 612, 618
+ //store the df so all variants use same idf
+ int df = reader.DocFreq(startTerm);
+ int numVariants = 0;
+ int totalVariantDocFreqs = 0;
+ BytesRef possibleMatch;
+ IBoostAttribute boostAtt =
+ fe.Attributes().AddAttribute<IBoostAttribute>();
+ while ((possibleMatch = fe.Next()) != null)
+ {
+ numVariants++;
+ totalVariantDocFreqs += fe.DocFreq();
+ float score = boostAtt.Boost;
+ // always insert while the per-term queue has room; once full, only when the score beats the queue's current top score
+ if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
+ {
+ ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
+ variantsQ.InsertWithOverflow(st);
+ minScore = variantsQ.Top().score; // maintain minScore
+ }
+ // publish minScore to the enum's attribute once the queue is full; negative infinity until then
+ maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Size() >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
+ }
+
+ if (numVariants > 0)
+ {
+ int avgDf = totalVariantDocFreqs / numVariants;
+ if (df == 0)//no direct match we can use as df for all variants
+ {
+ df = avgDf; //use avg df of all variants
+ }
+
+ // take the top variants (scored by edit distance) and reset the score
+ // to include an IDF factor then add to the global queue for ranking
+ // overall top query terms
+ int size = variantsQ.Size();
+ for (int i = 0; i < size; i++)
+ {
+ ScoreTerm st = variantsQ.Pop();
+ st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
+ q.InsertWithOverflow(st);
+ }
+ }
+ }
+ }
+ ts.End();
+ }
+ finally
+ {
+ // the token stream is closed whether or not the expansion above threw
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+ }
+
+ /// <summary>
+ /// Rewrites this query into a <see cref="BooleanQuery"/> of SHOULD clauses built from the
+ /// fuzzy variants collected for every field value added so far.
+ /// <para/>
+ /// The result is cached: a second call returns the same instance without consulting
+ /// <paramref name="reader"/> again. The pending field values are cleared once they have been expanded.
+ /// </summary>
+ /// <param name="reader">The index reader used to look up fuzzy variants and document frequencies.</param>
+ /// <returns>The rewritten query, carrying this query's boost.</returns>
+ public override Query Rewrite(IndexReader reader)
+ {
+ if (rewrittenQuery != null)
+ {
+ return rewrittenQuery;
+ }
+ //load up the list of possible terms
+ foreach (FieldVals f in fieldVals)
+ {
+ AddTerms(reader, f);
+ }
+ //clear the list of fields
+ fieldVals.Clear();
+
+ BooleanQuery bq = new BooleanQuery();
+
+ //create BooleanQueries to hold the variants for each token/field pair and ensure it
+ // has no coord factor
+ //Step 1: group the queued variants by the source term they were derived from
+ IDictionary<Term, List<ScoreTerm>> variantQueries = new Dictionary<Term, List<ScoreTerm>>();
+ int size = q.Size();
+ for (int i = 0; i < size; i++)
+ {
+ ScoreTerm st = q.Pop();
+ List<ScoreTerm> l;
+ if (!variantQueries.TryGetValue(st.fuzziedSourceTerm, out l) || l == null)
+ {
+ l = new List<ScoreTerm>();
+ variantQueries[st.fuzziedSourceTerm] = l;
+ }
+ l.Add(st);
+ }
+ //Step 2: Organize the grouped termqueries into zero-coord scoring boolean queries
+ foreach (List<ScoreTerm> variants in variantQueries.Values)
+ {
+ if (variants.Count == 1)
+ {
+ //optimize where only one selected variant
+ bq.Add(CreateVariantQuery(variants[0]), BooleanClause.Occur.SHOULD);
+ }
+ else
+ {
+ BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
+ foreach (ScoreTerm st in variants)
+ {
+ termVariants.Add(CreateVariantQuery(st), BooleanClause.Occur.SHOULD); // add to query
+ }
+ bq.Add(termVariants, BooleanClause.Occur.SHOULD); // add to query
+ }
+ }
+ //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+ // booleans with a minimum-should-match of NumFields-1?
+ bq.Boost = Boost;
+ this.rewrittenQuery = bq;
+ return bq;
+ }
+
+ /// <summary>
+ /// Builds the clause for a single variant: a <see cref="TermQuery"/> on the variant term,
+ /// wrapped in a <see cref="ConstantScoreQuery"/> when <c>ignoreTF</c> is set, with its boost
+ /// set to the variant's score.
+ /// </summary>
+ private Query CreateVariantQuery(ScoreTerm st)
+ {
+ // the second TermQuery argument (1) is carried over unchanged from the original clause construction
+ Query tq = ignoreTF ? (Query)new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1);
+ tq.Boost = st.score; // set the boost to a mix of IDF and score
+ return tq;
+ }
+
+ /// <summary>
+ /// Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+ /// term variants) then is reset with IDF for use in ranking against all other
+ /// terms/fields
+ /// </summary>
+ internal class ScoreTerm
+ {
+ /// <summary>The variant term returned by the fuzzy terms enum.</summary>
+ public Term term;
+ /// <summary>The variant's current score; used as the clause boost when the query is rewritten.</summary>
+ public float score;
+ /// <summary>The analyzed query term this variant was derived from; variants are grouped by it during rewrite.</summary>
+ internal Term fuzziedSourceTerm;
+
+ public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
+ {
+ this.term = term;
+ this.score = score;
+ this.fuzziedSourceTerm = fuzziedSourceTerm;
+ }
+ }
+
+ /// <summary>
+ /// Priority queue of <see cref="ScoreTerm"/>s ordered by score, with ties decided by term comparison.
+ /// </summary>
+ internal class ScoreTermQueue : Util.PriorityQueue<ScoreTerm>
+ {
+ public ScoreTermQueue(int size)
+ : base(size)
+ {
+ }
+
+ /// <summary>
+ /// Orders entries for <see cref="Util.PriorityQueue{T}.LessThan(T, T)"/>: a lower score is
+ /// "less than" a higher one; when the scores are equal, the entry whose term compares
+ /// greater is treated as the lesser one.
+ /// </summary>
+ public override bool LessThan(ScoreTerm termA, ScoreTerm termB)
+ {
+ return termA.score == termB.score
+ ? termA.term.CompareTo(termB.term) > 0
+ : termA.score < termB.score;
+ }
+
+ }
+
+ /// <summary>
+ /// Always returns <c>null</c>; this query does not build a string representation.
+ /// NOTE(review): code that prints or concatenates the result receives null - confirm this is intended.
+ /// <see cref="Query.ToString(string)"/>
+ /// </summary>
+ /// <param name="field">Not used.</param>
+ /// <returns><c>null</c></returns>
+ public override string ToString(string field)
+ {
+ return null;
+ }
+
+ /// <summary>
+ /// Gets or sets the ignore-term-frequency flag. When true, <see cref="Rewrite(IndexReader)"/>
+ /// wraps each variant's <see cref="TermQuery"/> in a <see cref="ConstantScoreQuery"/>.
+ /// </summary>
+ public virtual bool IgnoreTF
+ {
+ get { return ignoreTF; }
+ set { ignoreTF = value; }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
new file mode 100644
index 0000000..545b567
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
@@ -0,0 +1,215 @@
+\ufeffusing Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Implements the classic fuzzy search query. The similarity measurement
+ /// is based on the Levenshtein (edit distance) algorithm.
+ /// <para/>
+ /// Note that, unlike <see cref="FuzzyQuery"/>, this query will silently allow
+ /// for a (possibly huge) number of edit distances in comparisons, and may
+ /// be extremely slow (comparing every term in the index).
+ /// </summary>
+ [Obsolete("Use FuzzyQuery instead.")]
+ public class SlowFuzzyQuery : MultiTermQuery
+ {
+ public readonly static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+ public readonly static int defaultPrefixLength = 0;
+ public readonly static int defaultMaxExpansions = 50;
+
+ private float minimumSimilarity;
+ private int prefixLength;
+ private bool termLongEnough = false;
+
+ protected Term term;
+
+ /// <summary>
+ /// Create a new <see cref="SlowFuzzyQuery"/> that will match terms with a similarity
+ /// of at least <paramref name="minimumSimilarity"/> to <paramref name="term"/>.
+ /// If a <paramref name="prefixLength"/> &gt; 0 is specified, a common prefix
+ /// of that length is also required.
+ /// </summary>
+ /// <param name="term">the term to search for</param>
+ /// <param name="minimumSimilarity">
+ /// a value between 0 and 1 to set the required similarity
+ /// between the query term and the matching terms. For example, for a
+ /// <paramref name="minimumSimilarity"/> of <c>0.5</c> a term of the same length
+ /// as the query term is considered similar to the query term if the edit distance
+ /// between both terms is less than <c>length(term)*0.5</c>
+ /// <para/>
+ /// Alternatively, if <paramref name="minimumSimilarity"/> is &gt;= 1f, it is interpreted
+ /// as a pure Levenshtein edit distance. For example, a value of <c>2f</c>
+ /// will match all terms within an edit distance of <c>2</c> from the
+ /// query term. Edit distances specified in this way may not be fractional.
+ /// </param>
+ /// <param name="prefixLength">length of common (non-fuzzy) prefix</param>
+ /// <param name="maxExpansions">
+ /// the maximum number of terms to match. If this number is
+ /// greater than <see cref="BooleanQuery.MaxClauseCount"/> when the query is rewritten,
+ /// then the maxClauseCount will be used instead.
+ /// </param>
+ /// <exception cref="ArgumentException">
+ /// if <paramref name="minimumSimilarity"/> is &gt;= 1 and not a whole number, or is &lt; 0,
+ /// or if <paramref name="prefixLength"/> or <paramref name="maxExpansions"/> is &lt; 0
+ /// </exception>
+ public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+ int maxExpansions)
+ : base(term.Field)
+ {
+ this.term = term;
+
+ if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
+ throw new ArgumentException("fractional edit distances are not allowed");
+ if (minimumSimilarity < 0.0f)
+ throw new ArgumentException("minimumSimilarity < 0");
+ if (prefixLength < 0)
+ throw new ArgumentException("prefixLength < 0");
+ if (maxExpansions < 0)
+ throw new ArgumentException("maxExpansions < 0");
+
+ SetRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
+
+ string text = term.Text();
+ int len = text.CodePointCount(0, text.Length);
+ // when this flag stays false, GetTermsEnum only matches the exact term
+ if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity)))
+ {
+ this.termLongEnough = true;
+ }
+
+ this.minimumSimilarity = minimumSimilarity;
+ this.prefixLength = prefixLength;
+ }
+
+ /// <summary>
+ /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)</see>.
+ /// </summary>
+ public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
+ : this(term, minimumSimilarity, prefixLength, defaultMaxExpansions)
+ {
+ }
+
+ /// <summary>
+ /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)</see>.
+ /// </summary>
+ public SlowFuzzyQuery(Term term, float minimumSimilarity)
+ : this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)
+ {
+ }
+
+ /// <summary>
+ /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)</see>.
+ /// </summary>
+ public SlowFuzzyQuery(Term term)
+ : this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)
+ {
+ }
+
+ /// <summary>
+ /// Gets the minimum similarity that is required for this query to match.
+ /// This is the value passed to the constructor: either a fraction below 1, or a whole number &gt;= 1
+ /// (the constructor rejects negative values and fractional values &gt;= 1).
+ /// </summary>
+ public virtual float MinSimilarity
+ {
+ get { return minimumSimilarity; }
+ }
+
+ /// <summary>
+ /// Gets the non-fuzzy prefix length. This is the number of characters at the start
+ /// of a term that must be identical (not fuzzy) to the query term if the query
+ /// is to match that term.
+ /// </summary>
+ public virtual int PrefixLength
+ {
+ get { return prefixLength; }
+ }
+
+ /// <summary>
+ /// Returns a <see cref="SlowFuzzyTermsEnum"/> over <paramref name="terms"/> when the query term
+ /// was judged long enough at construction time; otherwise returns an enum that only matches
+ /// the exact query term.
+ /// </summary>
+ public override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
+ {
+ if (termLongEnough)
+ {
+ return new SlowFuzzyTermsEnum(terms, atts, Term, minimumSimilarity, prefixLength);
+ }
+
+ // can only match if it's exact
+ return new SingleTermsEnum(terms.Iterator(null), term.Bytes);
+ }
+
+ /// <summary>
+ /// Gets the pattern term.
+ /// </summary>
+ public virtual Term Term
+ {
+ get { return term; }
+ }
+
+ /// <summary>
+ /// Formats the query as <c>[field:]text~minimumSimilarity</c> followed by the boost string.
+ /// The field prefix is left out when the term's field equals <paramref name="field"/>.
+ /// </summary>
+ public override string ToString(string field)
+ {
+ StringBuilder sb = new StringBuilder();
+ bool sameField = term.Field.Equals(field);
+ if (!sameField)
+ {
+ sb.Append(term.Field).Append(":");
+ }
+ sb.Append(term.Text())
+ .Append('~')
+ .Append(Number.ToString(minimumSimilarity))
+ .Append(ToStringUtils.Boost(Boost));
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Combines the base hash with the minimum similarity, the prefix length and the term
+ /// (a null term contributes 0).
+ /// </summary>
+ public override int GetHashCode()
+ {
+ const int multiplier = 31;
+ int hash = base.GetHashCode();
+ hash = multiplier * hash + Number.FloatToIntBits(minimumSimilarity);
+ hash = multiplier * hash + prefixLength;
+ int termHash = (term == null) ? 0 : term.GetHashCode();
+ hash = multiplier * hash + termHash;
+ return hash;
+ }
+
+ /// <summary>
+ /// Equal when the base comparison succeeds, the runtime types match, and the minimum
+ /// similarity, prefix length and term are all the same.
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ // base.Equals runs first, exactly as before, so obj.GetType() is only reached after it accepted obj
+ if (!base.Equals(obj) || GetType() != obj.GetType())
+ {
+ return false;
+ }
+
+ SlowFuzzyQuery that = (SlowFuzzyQuery)obj;
+ if (Number.FloatToIntBits(minimumSimilarity) != Number.FloatToIntBits(that.minimumSimilarity))
+ {
+ return false;
+ }
+ if (prefixLength != that.prefixLength)
+ {
+ return false;
+ }
+
+ return (term == null)
+ ? that.term == null
+ : term.Equals(that.term);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
new file mode 100644
index 0000000..8be182c
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
@@ -0,0 +1,293 @@
+\ufeffusing Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Potentially slow fuzzy <see cref="TermsEnum"/> for enumerating all terms that are similar
+ /// to the specified filter term.
+ /// <para/>
+ /// If the minSimilarity or maxEdits is greater than the Automaton's
+ /// allowable range, this backs off to the classic (brute force)
+ /// fuzzy terms enum method by calling <see cref="FuzzyTermsEnum.GetAutomatonEnum(int, BytesRef)"/>.
+ /// <para/>
+ /// Term enumerations are always ordered by
+ /// <see cref="FuzzyTermsEnum.Comparator"/>. Each term in the enumeration is
+ /// greater than all that precede it.
+ /// </summary>
+ [Obsolete("Use FuzzyTermsEnum instead.")]
+ public class SlowFuzzyTermsEnum : FuzzyTermsEnum
+ {
+ /// <summary>
+ /// Creates the enum by forwarding every argument to the <see cref="FuzzyTermsEnum"/> base constructor
+ /// and passing <c>false</c> as its last argument.
+ /// NOTE(review): the last base argument is presumably the transpositions flag - confirm against <see cref="FuzzyTermsEnum"/>.
+ /// </summary>
+ public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
+ float minSimilarity, int prefixLength)
+ : base(terms, atts, term, minSimilarity, prefixLength, false)
+ {
+ }
+
+ /// <summary>
+ /// Selects the enum to use after the maximum edit distance changes. The enum returned by
+ /// <c>GetAutomatonEnum</c> is used when it is non-null; otherwise, only when
+ /// <paramref name="init"/> is true, the brute-force <see cref="LinearFuzzyTermsEnum"/> is installed.
+ /// When neither applies the current enum is left in place.
+ /// </summary>
+ protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
+ {
+ TermsEnum newEnum = GetAutomatonEnum(maxEdits, lastTerm);
+ if (newEnum != null)
+ {
+ Enum = newEnum;
+ }
+ else if (init)
+ {
+ Enum = new LinearFuzzyTermsEnum(this);
+ }
+ }
+
+ /// <summary>
+ /// Implement fuzzy enumeration with linear brute force.
+ /// </summary>
+ private class LinearFuzzyTermsEnum : FilteredTermsEnum
+ {
+ private readonly SlowFuzzyTermsEnum outerInstance;
+
+ /// <summary>
+ /// Allows us save time required to create a new array
+ /// every time similarity is called.
+ /// </summary>
+ private int[] d;
+ private int[] p;
+
+ /// <summary>this is the text, minus the prefix</summary>
+ private readonly int[] text;
+
+ private readonly IBoostAttribute boostAtt;
+
+ /// <summary>
+ /// Creates a brute-force enum over the terms of <paramref name="outerInstance"/>.
+ /// <para/>
+ /// The part of the outer enum's term text that follows its real prefix is copied into <c>text</c>,
+ /// the prefix itself is kept as <c>prefixBytesRef</c> and used as the initial seek term, and the two
+ /// row buffers used by <see cref="CalcDistance(int[], int, int)"/> are allocated.
+ /// </summary>
+ /// <param name="outerInstance">The enclosing enum that supplies the terms, term text, prefix length and thresholds.</param>
+ /// <exception cref="System.IO.IOException">If there is a low-level I/O error.</exception>
+ public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
+ : base(outerInstance.Terms.Iterator(null))
+ {
+ this.outerInstance = outerInstance;
+ this.boostAtt = Attributes().AddAttribute<IBoostAttribute>();
+
+ this.text = new int[outerInstance.TermLength - outerInstance.RealPrefixLength];
+ System.Array.Copy(outerInstance.TermText, outerInstance.RealPrefixLength, text, 0, text.Length);
+ string prefix = UnicodeUtil.NewString(outerInstance.TermText, 0, outerInstance.RealPrefixLength);
+ prefixBytesRef = new BytesRef(prefix);
+ // one slot more than the text length: the distance rows include the empty-prefix column
+ this.d = new int[this.text.Length + 1];
+ this.p = new int[this.text.Length + 1];
+
+
+ InitialSeekTerm = prefixBytesRef;
+ }
+
+ private readonly BytesRef prefixBytesRef;
+ /// <summary>used for unicode conversion from BytesRef byte[] to int[]</summary>
+ private readonly IntsRef utf32 = new IntsRef(20);
+
+ /// <summary>
+ /// <para>
+ /// Decides whether <paramref name="term"/> is a fuzzy match, using the Levenshtein distance
+ /// between the query text and the part of the candidate after the shared prefix.
+ /// </para>
+ /// <para>
+ /// A candidate that does not start with the prefix ends the enumeration. In raw mode a
+ /// candidate is accepted when its distance does not exceed the maximum edits. Otherwise it is
+ /// accepted when the following similarity is greater than the minimum similarity:
+ /// <code>
+ /// similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+ /// </code>
+ /// Accepted candidates get a boost of <c>(similarity - minSimilarity) * scale factor</c>.
+ /// </para>
+ /// </summary>
+ protected override sealed AcceptStatus Accept(BytesRef term)
+ {
+ if (!StringHelper.StartsWith(term, prefixBytesRef))
+ {
+ return AcceptStatus.END;
+ }
+
+ UnicodeUtil.UTF8toUTF32(term, utf32);
+ int distance = CalcDistance(utf32.Ints, outerInstance.RealPrefixLength, utf32.Length - outerInstance.RealPrefixLength);
+
+ // int.MinValue is the sentinel that Levenshtein stopped early
+ if (distance == int.MinValue)
+ {
+ return AcceptStatus.NO;
+ }
+
+ // no need to calc similarity, if raw is true and distance > maxEdits
+ if (outerInstance.Raw && distance > outerInstance.MaxEdits)
+ {
+ return AcceptStatus.NO;
+ }
+
+ float similarity = CalcSimilarity(distance, (utf32.Length - outerInstance.RealPrefixLength), text.Length);
+
+ // in raw mode the distance is already known to be within maxEdits at this point;
+ // otherwise the similarity has to beat the minimum
+ bool accepted = outerInstance.Raw || similarity > outerInstance.MinSimilarity;
+ if (!accepted)
+ {
+ return AcceptStatus.NO;
+ }
+
+ boostAtt.Boost = (similarity - outerInstance.MinSimilarity) * outerInstance.Scale_factor;
+ return AcceptStatus.YES;
+ }
+
+ /******************************
+ * Compute Levenshtein distance
+ ******************************/
+
+ /// <summary>
+ /// <para>
+ /// <see cref="CalcDistance(int[], int, int)"/> returns the Levenshtein distance between the query term
+ /// and the target term.
+ /// </para>
+ /// <para>
+ /// Embedded within this algorithm is a fail-fast Levenshtein distance
+ /// algorithm. The fail-fast algorithm differs from the standard Levenshtein
+ /// distance algorithm in that it is aborted if it is discovered that the
+ /// minimum distance between the words is greater than some threshold.
+ /// </para>
+ /// <para>
+ /// Levenshtein distance (also known as edit distance) is a measure of similarity
+ /// between two strings where the distance is measured as the number of character
+ /// deletions, insertions or substitutions required to transform one string to
+ /// the other string.
+ /// </para>
+ /// </summary>
+ /// <param name="target">the target word or phrase</param>
+ /// <param name="offset">the offset at which to start the comparison</param>
+ /// <param name="length">the length of what's left of the string to compare</param>
+ /// <returns>
+ /// the number of edits, or <see cref="int.MinValue"/> if the computation was abandoned because
+ /// the edit distance is greater than maxDistance.
+ /// </returns>
+ private int CalcDistance(int[] target, int offset, int length)
+ {
+ int m = length;
+ int n = text.Length;
+ if (n == 0)
+ {
+ //we don't have anything to compare. That means if we just add
+ //the letters for m we get the new word
+ return m;
+ }
+ if (m == 0)
+ {
+ return n;
+ }
+
+ int maxDistance = CalculateMaxDistance(m);
+
+ if (maxDistance < Math.Abs(m - n))
+ {
+ //just adding the characters of m to n or vice-versa results in
+ //too many edits
+ //for example "pre" length is 3 and "prefixes" length is 8. We can see that
+ //given this optimal circumstance, the edit distance cannot be less than 5.
+ //which is 8-3 or more precisely Math.abs(3-8).
+ //if our maximum edit distance is 4, then we can discard this word
+ //without looking at it.
+ return int.MinValue;
+ }
+
+ // init the 'previous row' p with the distances from the empty target prefix
+ for (int i = 0; i <= n; ++i)
+ {
+ p[i] = i;
+ }
+
+ // start computing edit distance
+ for (int j = 1; j <= m; ++j)
+ { // iterates through target
+ int bestPossibleEditDistance = m;
+ int t_j = target[offset + j - 1]; // jth character of t
+ d[0] = j;
+
+ for (int i = 1; i <= n; ++i)
+ { // iterates through text
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
+ if (t_j != text[i - 1])
+ {
+ d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
+ }
+ else
+ {
+ d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
+ }
+ bestPossibleEditDistance = Math.Min(bestPossibleEditDistance, d[i]);
+ }
+
+ //After calculating row j, the best possible edit distance
+ //is the smallest value found in that row (cells 1..n).
+ //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+ if (j > maxDistance && bestPossibleEditDistance > maxDistance)
+ { //equal is okay, but not greater
+ //the closest the target can be to the text is just too far away.
+ //this target is leaving the party early.
+ return int.MinValue;
+ }
+
+ // copy current distance counts to 'previous row' distance counts: swap p and d
+ int[] _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+
+ return p[n];
+ }
+
+ /// <summary>
+ /// Converts an edit count into a similarity:
+ /// <c>1 - edits / (real prefix length + min(n, m))</c>.
+ /// </summary>
+ /// <param name="edits">the number of edits between the two words</param>
+ /// <param name="m">the length of one word, without the prefix</param>
+ /// <param name="n">the length of the other word, without the prefix</param>
+ private float CalcSimilarity(int edits, int m, int n)
+ {
+ // The result drops below 0.0 when the edit distance exceeds the number of
+ // characters in the shorter word. The formula is kept as it was previously
+ // used in FuzzyTermEnum (even though minimumSimilarity must be greater than 0.0).
+ float comparedLength = (float)(outerInstance.RealPrefixLength + Math.Min(n, m));
+ float editRatio = (float)edits / comparedLength;
+ return 1.0f - editRatio;
+ }
+
+ /// <summary>
+ /// Computes the largest Levenshtein distance worth considering for a candidate of length
+ /// <paramref name="m"/>. In raw mode this is simply the maximum edits; otherwise it is the
+ /// smaller of the maximum edits and the distance implied by the minimum similarity.
+ /// </summary>
+ /// <param name="m">the length of the "other value"</param>
+ /// <returns>the maximum levenshtein distance that we care about</returns>
+ private int CalculateMaxDistance(int m)
+ {
+ if (outerInstance.Raw)
+ {
+ return outerInstance.MaxEdits;
+ }
+
+ int similarityBound = (int)((1 - outerInstance.MinSimilarity) * (Math.Min(text.Length, m) + outerInstance.RealPrefixLength));
+ return Math.Min(outerInstance.MaxEdits, similarityBound);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs b/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
new file mode 100644
index 0000000..cb53d81
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
@@ -0,0 +1,384 @@
+\ufeffusing Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Diagnostics;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// SortField for <see cref="SortedSetDocValues"/>.
+    /// <para/>
+    /// A <see cref="SortedSetDocValues"/> contains multiple values for a field, so sorting with
+    /// this technique "selects" a value as the representative sort value for the document.
+    /// <para/>
+    /// By default, the minimum value in the set is selected as the sort value, but
+    /// this can be customized. Selectors other than the default do have some limitations
+    /// (see below) to ensure that all selections happen in constant-time for performance.
+    /// <para/>
+    /// Like sorting by string, this also supports sorting missing values as first or last,
+    /// via <see cref="MissingValue"/>.
+    /// <para/>
+    /// Limitations:
+    /// <list type="bullet">
+    ///     <item>
+    ///         Fields containing <see cref="int.MaxValue"/> or more unique values
+    ///         are unsupported.
+    ///     </item>
+    ///     <item>
+    ///         Selectors other than the default <see cref="Selector.MIN"/> require
+    ///         optional codec support. However several codecs provided by Lucene,
+    ///         including the current default codec, support this.
+    ///     </item>
+    /// </list>
+    /// </summary>
+    public class SortedSetSortField : SortField
+    {
+        // LUCENENET NOTE: Selector enum moved outside of this class to prevent
+        // naming conflicts.
+
+        private readonly Selector selector;
+
+        /// <summary>
+        /// Creates a sort, possibly in reverse, by the minimum value in the set
+        /// for the document.
+        /// </summary>
+        /// <param name="field">Name of field to sort by. Must not be null.</param>
+        /// <param name="reverse">True if natural order should be reversed.</param>
+        public SortedSetSortField(string field, bool reverse)
+            : this(field, reverse, Selector.MIN)
+        {
+        }
+
+        /// <summary>
+        /// Creates a sort, possibly in reverse, specifying how the sort value from
+        /// the document's set is selected.
+        /// </summary>
+        /// <param name="field">Name of field to sort by. Must not be null.</param>
+        /// <param name="reverse">True if natural order should be reversed.</param>
+        /// <param name="selector">
+        /// custom selector for choosing the sort value from the set.
+        /// <para/>
+        /// NOTE: selectors other than <see cref="Selector.MIN"/> require optional codec support.
+        /// </param>
+        public SortedSetSortField(string field, bool reverse, Selector selector)
+            : base(field, SortField.Type_e.CUSTOM, reverse)
+        {
+            // LUCENENET NOTE: Selector enum cannot be null in .NET, so we avoid this issue
+            // by not making the parameter nullable; no null check is needed here.
+            this.selector = selector;
+        }
+
+        /// <summary>Returns the selector in use for this sort</summary>
+        public Selector Selector
+        {
+            get { return selector; }
+        }
+
+        /// <summary>Combines the base hash code with the selector's hash code.</summary>
+        public override int GetHashCode()
+        {
+            return 31 * base.GetHashCode() + selector.GetHashCode();
+        }
+
+        /// <summary>
+        /// Two instances are equal when the base class considers them equal, they have
+        /// exactly the same runtime type, and they use the same selector.
+        /// </summary>
+        /// <param name="obj">the object to compare with; may be null</param>
+        public override bool Equals(object obj)
+        {
+            if (ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+            // Guard explicitly instead of relying on the base class to reject null
+            // before obj.GetType() is dereferenced below.
+            if (obj == null || !base.Equals(obj))
+            {
+                return false;
+            }
+            if (GetType() != obj.GetType())
+            {
+                return false;
+            }
+            return selector == ((SortedSetSortField)obj).selector;
+        }
+
+        /// <summary>
+        /// Renders the field name, a '!' marker when reversed, the missing value
+        /// (when one is set) and the selector.
+        /// </summary>
+        public override string ToString()
+        {
+            StringBuilder buffer = new StringBuilder();
+            buffer.Append("<sortedset" + ": \"").Append(Field).Append("\">");
+            if (Reverse)
+            {
+                buffer.Append('!');
+            }
+            if (missingValue != null)
+            {
+                buffer.Append(" missingValue=");
+                buffer.Append(missingValue);
+            }
+            buffer.Append(" selector=");
+            buffer.Append(selector);
+
+            return buffer.ToString();
+        }
+
+        /// <summary>
+        /// Set how missing values (the empty set) are sorted.
+        /// <para/>
+        /// Note that this must be <see cref="SortField.STRING_FIRST"/> or
+        /// <see cref="SortField.STRING_LAST"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException">
+        /// if the value is neither of the two sentinel objects
+        /// </exception>
+        public override object MissingValue
+        {
+            set
+            {
+                // Reference comparison on purpose: only the two sentinel objects are accepted.
+                if (value != STRING_FIRST && value != STRING_LAST)
+                {
+                    throw new ArgumentException("For SORTED_SET type, missing value must be either STRING_FIRST or STRING_LAST");
+                }
+                this.missingValue = value;
+            }
+        }
+
+        /// <summary>
+        /// Term-ordinal comparator that sorts on a single-valued view of the field's
+        /// <see cref="SortedSetDocValues"/>, chosen according to the owning
+        /// <see cref="SortedSetSortField"/>'s selector.
+        /// </summary>
+        internal class TermOrdValComparatorAnonymousHelper : FieldComparator.TermOrdValComparator
+        {
+            private readonly SortedSetSortField outerInstance;
+
+            public TermOrdValComparatorAnonymousHelper(SortedSetSortField outerInstance, int numHits)
+                : base(numHits, outerInstance.Field, outerInstance.missingValue == STRING_LAST)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            /// <summary>
+            /// Loads the doc term ords for <paramref name="field"/> and wraps them in the
+            /// single-valued view that matches the configured selector.
+            /// </summary>
+            /// <exception cref="NotSupportedException">
+            /// if the field has <see cref="int.MaxValue"/> or more unique values, or if a
+            /// selector other than <see cref="Selector.MIN"/> is used and the values are
+            /// not <see cref="RandomAccessOrds"/>
+            /// </exception>
+            /// <exception cref="InvalidOperationException">
+            /// if the selector is not one of the defined <see cref="Selector"/> values
+            /// </exception>
+            protected override SortedDocValues GetSortedDocValues(AtomicReaderContext context, string field)
+            {
+                SortedSetDocValues sortedSet = FieldCache.DEFAULT.GetDocTermOrds(context.AtomicReader, field);
+
+                if (sortedSet.ValueCount >= int.MaxValue)
+                {
+                    throw new NotSupportedException("fields containing more than " + (int.MaxValue - 1) + " unique terms are unsupported");
+                }
+
+                SortedDocValues singleton = DocValues.UnwrapSingleton(sortedSet);
+                if (singleton != null)
+                {
+                    // it's actually single-valued in practice, but indexed as multi-valued,
+                    // so just sort on the underlying single-valued dv directly.
+                    // regardless of selector type, this optimization is safe!
+                    return singleton;
+                }
+
+                if (outerInstance.selector == Selector.MIN)
+                {
+                    return new MinValue(sortedSet);
+                }
+
+                // Every other selector needs random access to a document's ordinals.
+                RandomAccessOrds randomOrds = sortedSet as RandomAccessOrds;
+                if (randomOrds == null)
+                {
+                    throw new NotSupportedException("codec does not support random access ordinals, cannot use selector: " + outerInstance.selector);
+                }
+
+                switch (outerInstance.selector)
+                {
+                    case Selector.MAX: return new MaxValue(randomOrds);
+                    case Selector.MIDDLE_MIN: return new MiddleMinValue(randomOrds);
+                    case Selector.MIDDLE_MAX: return new MiddleMaxValue(randomOrds);
+                    case Selector.MIN: // already handled above
+                    default:
+                        // Fail loudly in every build configuration instead of handing a
+                        // null SortedDocValues back to the comparator.
+                        throw new InvalidOperationException("unexpected selector: " + outerInstance.selector);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates the comparator used to sort on this field.
+        /// <paramref name="sortPos"/> is not used by this implementation.
+        /// </summary>
+        public override FieldComparator GetComparator(int numHits, int sortPos)
+        {
+            return new TermOrdValComparatorAnonymousHelper(this, numHits);
+        }
+
+        /// <summary>Wraps a <see cref="SortedSetDocValues"/> and returns the first ordinal (min)</summary>
+        internal class MinValue : SortedDocValues
+        {
+            internal readonly SortedSetDocValues @in;
+
+            internal MinValue(SortedSetDocValues @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Positions the wrapped values on the document and returns the first ordinal it yields.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                return (int)@in.NextOrd();
+            }
+
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            public override int ValueCount
+            {
+                get { return (int)@in.ValueCount; }
+            }
+
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>Wraps a <see cref="SortedSetDocValues"/> and returns the last ordinal (max)</summary>
+        internal class MaxValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MaxValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Returns the document's last ordinal, or -1 when it has none.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                int count = @in.Cardinality();
+                if (count == 0)
+                {
+                    return -1;
+                }
+                return (int)@in.OrdAt(count - 1);
+            }
+
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            public override int ValueCount
+            {
+                get { return (int)@in.ValueCount; }
+            }
+
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>Wraps a <see cref="SortedSetDocValues"/> and returns the middle ordinal (or min of the two)</summary>
+        internal class MiddleMinValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MiddleMinValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>
+            /// Returns the ordinal at index (count - 1) / 2, or -1 when the document
+            /// has no ordinals.
+            /// </summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                int count = @in.Cardinality();
+                if (count == 0)
+                {
+                    return -1;
+                }
+                // Unsigned shift first, then cast - same shape as MiddleMaxValue.
+                return (int)@in.OrdAt((int)((uint)(count - 1) >> 1));
+            }
+
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            public override int ValueCount
+            {
+                get { return (int)@in.ValueCount; }
+            }
+
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>Wraps a <see cref="SortedSetDocValues"/> and returns the middle ordinal (or max of the two)</summary>
+        internal class MiddleMaxValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MiddleMaxValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>
+            /// Returns the ordinal at index count / 2, or -1 when the document has
+            /// no ordinals.
+            /// </summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                int count = @in.Cardinality();
+                if (count == 0)
+                {
+                    return -1;
+                }
+                return (int)@in.OrdAt((int)((uint)count >> 1));
+            }
+
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            public override int ValueCount
+            {
+                get { return (int)@in.ValueCount; }
+            }
+
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+    }
+
+    /// <summary>
+    /// Chooses which value of a document's set acts as its sort value.
+    /// </summary>
+    public enum Selector
+    {
+        /// <summary>
+        /// Picks the smallest value in the set.
+        /// </summary>
+        MIN = 0,
+
+        /// <summary>
+        /// Picks the largest value in the set.
+        /// </summary>
+        MAX = 1,
+
+        /// <summary>
+        /// Picks the middle value in the set.
+        /// <para/>
+        /// When the set holds an even number of values, the lower of the two middle values is used.
+        /// </summary>
+        MIDDLE_MIN = 2,
+
+        /// <summary>
+        /// Picks the middle value in the set.
+        /// <para/>
+        /// When the set holds an even number of values, the higher of the two middle values is used.
+        /// </summary>
+        MIDDLE_MAX = 3
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj b/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
new file mode 100644
index 0000000..f0493af
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
@@ -0,0 +1,92 @@
+\ufeff<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <!-- Build definition for the Lucene.Net.Tests.Sandbox test assembly. -->
+ <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+ <!-- Settings shared by all configurations: a class library targeting .NET Framework 4.5.1. Configuration and Platform fall back to Debug and AnyCPU when not supplied. -->
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProjectGuid>{7865CBC8-2C6B-462C-ACF5-B2C4D60D93C9}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Sandbox</RootNamespace>
+ <AssemblyName>Lucene.Net.Tests.Sandbox</AssemblyName>
+ <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ <TargetFrameworkProfile />
+ </PropertyGroup>
+ <!-- Debug|AnyCPU: unoptimized build with full debug symbols; defines DEBUG and TRACE. -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <!-- Release|AnyCPU: optimized build with pdbonly symbols; defines TRACE only. -->
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <!-- Assembly references: NUnit 2.6.3 resolved through the ..\packages hint path, followed by framework assemblies. -->
+ <ItemGroup>
+ <Reference Include="nunit.framework, Version=2.6.3.13283, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL">
+ <HintPath>..\packages\NUnit.2.6.3\lib\nunit.framework.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="System" />
+ <Reference Include="System.Core" />
+ <Reference Include="System.Xml.Linq" />
+ <Reference Include="System.Data.DataSetExtensions" />
+ <Reference Include="Microsoft.CSharp" />
+ <Reference Include="System.Data" />
+ <Reference Include="System.Net.Http" />
+ <Reference Include="System.Xml" />
+ </ItemGroup>
+ <!-- C# sources compiled into this assembly. -->
+ <ItemGroup>
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="Queries\DuplicateFilterTest.cs" />
+ <Compile Include="Queries\FuzzyLikeThisQueryTest.cs" />
+ <Compile Include="Queries\TestSlowFuzzyQuery.cs" />
+ <Compile Include="Queries\TestSlowFuzzyQuery2.cs" />
+ <Compile Include="Queries\TestSortedSetSortField.cs" />
+ <Compile Include="Queries\TestSortedSetSortFieldDocValues.cs" />
+ <Compile Include="Queries\TestSortedSetSortFieldSelectors.cs" />
+ </ItemGroup>
+ <!-- NOTE(review): this Service GUID is presumably the marker Visual Studio adds to test projects; confirm before removing. -->
+ <ItemGroup>
+ <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+ </ItemGroup>
+ <!-- Sibling projects this assembly is built against. -->
+ <ItemGroup>
+ <ProjectReference Include="..\Lucene.Net.Codecs\Lucene.Net.Codecs.csproj">
+ <Project>{3F79B6D4-4359-4F83-B64F-07F4F6262425}</Project>
+ <Name>Lucene.Net.Codecs</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+ <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+ <Name>Lucene.Net</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\Lucene.Net.Sandbox\Lucene.Net.Sandbox.csproj">
+ <Project>{13274ba9-9052-4354-8ffe-e3f32593368f}</Project>
+ <Name>Lucene.Net.Sandbox</Name>
+ </ProjectReference>
+ <ProjectReference Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj">
+ <Project>{B2C0D749-CE34-4F62-A15E-00CB2FF5DDB3}</Project>
+ <Name>Lucene.Net.TestFramework</Name>
+ </ProjectReference>
+ </ItemGroup>
+ <!-- Project items that are not compiled. -->
+ <ItemGroup>
+ <None Include="packages.config" />
+ </ItemGroup>
+ <!-- Data file embedded into the assembly as a resource. -->
+ <ItemGroup>
+ <EmbeddedResource Include="Queries\fuzzyTestData.txt" />
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs b/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..18e0d47
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+\ufeffusing System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Tests.Sandbox")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Tests.Sandbox")]
+// The copyright sign is written as an escape so the value does not depend on
+// the encoding this file is saved or transmitted in.
+[assembly: AssemblyCopyright("Copyright \u00A9 2016")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("7865cbc8-2c6b-462c-acf5-b2c4d60d93c9")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs b/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
new file mode 100644
index 0000000..a0e4742
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
@@ -0,0 +1,185 @@
+\ufeffusing Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// Tests <see cref="DuplicateFilter"/> against a small index in which several
+    /// documents share the same value in the "url" key field.
+    /// </summary>
+    public class DuplicateFilterTest : LuceneTestCase
+    {
+        private static readonly string KEY_FIELD = "url";
+        private Directory directory;
+        private IndexReader reader;
+        private readonly TermQuery tq = new TermQuery(new Term("text", "lucene"));
+        private IndexSearcher searcher;
+
+        /// <summary>
+        /// Builds a single-segment index of eight documents spread over two urls
+        /// and opens a searcher on it.
+        /// </summary>
+        public override void SetUp()
+        {
+            base.SetUp();
+            directory = NewDirectory();
+            RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
+
+            //Add series of docs with filterable fields : url, text and dates flags
+            AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
+            AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
+            AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
+            AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
+            AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
+
+            // Until we fix LUCENE-2348, the index must
+            // have only 1 segment:
+            writer.ForceMerge(1);
+
+            reader = writer.Reader;
+            writer.Dispose();
+            searcher = NewSearcher(reader);
+        }
+
+        /// <summary>Releases the reader and directory opened in <see cref="SetUp"/>.</summary>
+        public override void TearDown()
+        {
+            reader.Dispose();
+            directory.Dispose();
+            base.TearDown();
+        }
+
+        /// <summary>Adds one document with stored url, text and date fields.</summary>
+        private void AddDoc(RandomIndexWriter writer, string url, string text, string date)
+        {
+            Document doc = new Document();
+            doc.Add(NewStringField(KEY_FIELD, url, Field.Store.YES));
+            doc.Add(NewTextField("text", text, Field.Store.YES));
+            doc.Add(NewTextField("date", date, Field.Store.YES));
+            writer.AddDocument(doc);
+        }
+
+        /// <summary>With default settings no url may appear twice in the hits.</summary>
+        [Test]
+        public void TestDefaultFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            // Without this check the loop below would pass vacuously on an empty result.
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                // HashSet<T>.Add returns false when the url was already seen.
+                assertTrue("No duplicate urls should be returned", results.Add(url));
+            }
+        }
+
+        /// <summary>Sanity check: without the filter the same url shows up more than once.</summary>
+        [Test]
+        public void TestNoFilter()
+        {
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;
+            assertTrue("Default searching should have found some matches", hits.Length > 0);
+            bool dupsFound = false;
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                if (!results.Add(url))
+                {
+                    dupsFound = true;
+                }
+            }
+            assertTrue("Default searching should have found duplicate urls", dupsFound);
+        }
+
+        /// <summary>Fast-invalidation mode returns exactly one hit for each of the two urls.</summary>
+        [Test]
+        public void TestFastFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.ProcessingMode = ProcessingMode.PM_FAST_INVALIDATION;
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                assertTrue("No duplicate urls should be returned", results.Add(url));
+            }
+            assertEquals("Two urls found", 2, results.Count);
+        }
+
+        /// <summary>In keep-last mode every hit is the last document that carries its url.</summary>
+        [Test]
+        public void TestKeepsLastFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.KeepMode = KeepMode.KM_USE_LAST_OCCURRENCE;
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                DocsEnum td = TestUtil.Docs(Random(), reader,
+                    KEY_FIELD,
+                    new BytesRef(url),
+                    MultiFields.GetLiveDocs(reader),
+                    null,
+                    0);
+
+                // Walk the postings to the end to find the last doc with this url.
+                int lastDoc = 0;
+                while (td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
+                {
+                    lastDoc = td.DocID();
+                }
+                assertEquals("Duplicate urls should return last doc", lastDoc, hit.Doc);
+            }
+        }
+
+        /// <summary>In keep-first mode every hit is the first document that carries its url.</summary>
+        [Test]
+        public void TestKeepsFirstFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.KeepMode = KeepMode.KM_USE_FIRST_OCCURRENCE;
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                DocsEnum td = TestUtil.Docs(Random(), reader,
+                    KEY_FIELD,
+                    new BytesRef(url),
+                    MultiFields.GetLiveDocs(reader),
+                    null,
+                    0);
+
+                // The first posting is the first doc with this url.
+                td.NextDoc();
+                int firstDoc = td.DocID();
+                assertEquals("Duplicate urls should return first doc", firstDoc, hit.Doc);
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs b/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
new file mode 100644
index 0000000..4b830c6
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
@@ -0,0 +1,159 @@
+\ufeffusing Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    /// <summary>
+    /// Tests <see cref="FuzzyLikeThisQuery"/> against a small index of misspelt names.
+    /// </summary>
+    public class FuzzyLikeThisQueryTest : LuceneTestCase
+    {
+        private Directory directory;
+        private IndexSearcher searcher;
+        private IndexReader reader;
+        private Analyzer analyzer;
+
+        /// <summary>
+        /// Indexes six name variants and opens a searcher on them. The analyzer
+        /// used for indexing is the same instance later handed to the queries.
+        /// </summary>
+        public override void SetUp()
+        {
+            base.SetUp();
+
+            analyzer = new MockAnalyzer(Random());
+            directory = NewDirectory();
+            // Reuse the field's analyzer so index-time and query-time analysis share one instance.
+            RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMergePolicy(NewLogMergePolicy()));
+
+            //Add series of docs with misspelt names
+            AddDoc(writer, "jonathon smythe", "1");
+            AddDoc(writer, "jonathan smith", "2");
+            AddDoc(writer, "johnathon smyth", "3");
+            AddDoc(writer, "johnny smith", "4");
+            AddDoc(writer, "jonny smith", "5");
+            AddDoc(writer, "johnathon smythe", "6");
+            reader = writer.Reader;
+            writer.Dispose();
+            searcher = NewSearcher(reader);
+        }
+
+        /// <summary>Releases the reader and directory opened in <see cref="SetUp"/>.</summary>
+        public override void TearDown()
+        {
+            reader.Dispose();
+            directory.Dispose();
+            base.TearDown();
+        }
+
+        /// <summary>Adds one document with stored name and id fields.</summary>
+        private void AddDoc(RandomIndexWriter writer, string name, string id)
+        {
+            Document doc = new Document();
+            doc.Add(NewTextField("name", name, Field.Store.YES));
+            doc.Add(NewTextField("id", id, Field.Store.YES));
+            writer.AddDocument(doc);
+        }
+
+        //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
+        [Test]
+        public void TestClosestEditDistanceMatchComesFirst()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant smythe", queryTerms.Contains(new Term("name", "smythe")));
+            assertTrue("Should have variant smith", queryTerms.Contains(new Term("name", "smith")));
+            assertTrue("Should have variant smyth", queryTerms.Contains(new Term("name", "smyth")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar not most rare variant", "2", doc.Get("id"));
+        }
+
+        //Test multiple input words are having variants produced
+        [Test]
+        public void TestMultiWord()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant jonathan", queryTerms.Contains(new Term("name", "jonathan")));
+            assertTrue("Should have variant smith", queryTerms.Contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+
+        // LUCENE-4809
+        [Test]
+        public void TestNonExistingField()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
+            flt.AddTerms("jonathin smoth", "this field does not exist", 0.3f, 1);
+            // don't fail here just because the field doesn't exist
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant jonathan", queryTerms.Contains(new Term("name", "jonathan")));
+            assertTrue("Should have variant smith", queryTerms.Contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+
+        //Test bug found when first query word does not match anything
+        [Test]
+        public void TestNoMatchFirstWordBug()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("fernando smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant smith", queryTerms.Contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+
+        //Two queries built from identical arguments must compare equal
+        [Test]
+        public void TestFuzzyLikeThisQueryEquals()
+        {
+            // Deliberately a fresh analyzer, named so it does not shadow the field.
+            Analyzer sharedAnalyzer = new MockAnalyzer(Random());
+            FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, sharedAnalyzer);
+            fltq1.AddTerms("javi", "subject", 0.5f, 2);
+            FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, sharedAnalyzer);
+            fltq2.AddTerms("javi", "subject", 0.5f, 2);
+            assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1,
+                fltq2);
+        }
+    }
+}