You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by di...@apache.org on 2010/03/03 22:31:21 UTC
svn commit: r918703 [1/2] - in
/lucene/lucene.net/trunk/C#/contrib/Queries.Net: ./ Queries.Net/
Queries.Net/Properties/ Queries.Net/Similar/ Test/ Test/Properties/
Test/Similar/
Author: digy
Date: Wed Mar 3 21:31:20 2010
New Revision: 918703
URL: http://svn.apache.org/viewvc?rev=918703&view=rev
Log:
LUCENENET-347 [Contrib] Port of Queries (Initial Port)
Added:
lucene/lucene.net/trunk/C#/contrib/Queries.Net/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/package.html
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Support.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/TermsFilter.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BooleanFilterTest.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BoostingQueryTest.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/DuplicateFilterTest.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/TermsFilterTest.cs
lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Test.csproj
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net.sln?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln Wed Mar 3 21:31:20 2010
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C# Express 2008
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Queries.Net", "Queries.Net\Queries.Net.csproj", "{481CF6E3-52AF-4621-9DEB-022122079AF6}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Any CPU = Debug|Any CPU
+ Release|Any CPU = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {481CF6E3-52AF-4621-9DEB-022122079AF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {481CF6E3-52AF-4621-9DEB-022122079AF6}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {481CF6E3-52AF-4621-9DEB-022122079AF6}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {481CF6E3-52AF-4621-9DEB-022122079AF6}.Release|Any CPU.Build.0 = Release|Any CPU
+ {8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Release|Any CPU.Build.0 = Release|Any CPU
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/BooleanFilter.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
+ class BooleanFilter
+ {
+ public BooleanFilter()
+ {
+ throw new NotImplementedException("Not implemented yet.");
+ }
+ }
+}
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/BoostingQuery.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+
+namespace Lucene.Net.Search
+{
+ /// <summary>
+ /// The BoostingQuery class can be used to effectively demote results that match a given query.
+ /// Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
+ /// but reduces their overall score:
+ ///
+ /// Query balancedQuery = new BoostingQuery(positiveQuery, negativeQuery, 0.01f);
+ /// In this scenario the positiveQuery contains the mandatory, desirable criteria which are used to
+ /// select all matching documents, and the negativeQuery contains the undesirable elements which
+ /// are simply used to lessen the scores. Documents that match the negativeQuery have their score
+ /// multiplied by the supplied "boost" parameter, so this should be less than 1 to achieve a
+ /// demoting effect
+ ///
+ /// This code was originally made available here: [WWW] http://marc.theaimsgroup.com/?l=lucene-user&m=108058407130459&w=2
+ /// and is documented here: http://wiki.apache.org/lucene-java/CommunityContributions
+ /// </summary>
+ public class BoostingQuery : Query
+ {
+ private float boost; // the amount to boost by
+ private Query match; // query to match
+ private Query context; // boost when matches too
+
+ public BoostingQuery(Query match, Query context, float boost)
+ {
+ this.match = match;
+ this.context = (Query)context.Clone(); // clone before boost
+ this.boost = boost;
+
+ this.context.SetBoost(0.0f); // ignore context-only matches
+ }
+
+ public override Query Rewrite(IndexReader reader)
+ {
+ BooleanQuery result = new AnonymousBooleanQuery(boost);
+
+ result.Add(match, BooleanClause.Occur.MUST);
+ result.Add(context, BooleanClause.Occur.SHOULD);
+
+ return result;
+ }
+
+ class AnonymousBooleanQuery : BooleanQuery
+ {
+ float boost;
+ public AnonymousBooleanQuery(float boost)
+ {
+ this.boost = boost;
+ }
+
+ public override Similarity GetSimilarity(Searcher searcher)
+ {
+ return new AnonymousDefaultSimilarity(boost);
+ }
+ }
+
+ class AnonymousDefaultSimilarity : DefaultSimilarity
+ {
+ float boost ;
+ public AnonymousDefaultSimilarity(float boost)
+ {
+ this.boost = boost;
+ }
+
+ public override float Coord(int overlap, int max)
+ {
+ switch (overlap)
+ {
+
+ case 1: // matched only one clause
+ return 1.0f; // use the score as-is
+
+ case 2: // matched both clauses
+ return boost; // multiply by boost
+
+ default:
+ return 0.0f;
+
+ }
+ }
+ }
+
+ public override int GetHashCode()
+ {
+ int prime = 31;
+ int result = 1;
+ result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(boost),0);
+ result = prime * result + ((context == null) ? 0 : context.GetHashCode());
+ result = prime * result + ((match == null) ? 0 : match.GetHashCode());
+ return result;
+ }
+
+ public override bool Equals(Object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (this.GetType() != obj.GetType())
+ return false;
+ BoostingQuery other = (BoostingQuery)obj;
+ if (BitConverter.ToInt32(BitConverter.GetBytes(boost),0) != BitConverter.ToInt32(BitConverter.GetBytes(other.boost),0) )
+ return false;
+ if (context == null)
+ {
+ if (other.context != null)
+ return false;
+ }
+ else if (!context.Equals(other.context))
+ return false;
+ if (match == null)
+ {
+ if (other.match != null)
+ return false;
+ }
+ else if (!match.Equals(other.match))
+ return false;
+ return true;
+ }
+
+ public override String ToString(String field)
+ {
+ return match.ToString(field) + "/" + context.ToString(field);
+ }
+ }
+}
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Search;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search
+{
+ public class DuplicateFilter : Filter
+ {
+
+ String fieldName;
+
+ /**
+ * KeepMode determines which document id to consider as the master, all others being
+ * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+ */
+ int keepMode = KM_USE_FIRST_OCCURRENCE;
+ public static int KM_USE_FIRST_OCCURRENCE = 1;
+ public static int KM_USE_LAST_OCCURRENCE = 2;
+
+ /**
+ * "Full" processing mode starts by setting all bits to false and only setting bits
+ * for documents that contain the given field and are identified as non-duplicates.
+
+ * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+ * given field. This approach avoids the need to read TermDocs for terms that are seen
+ * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
+ * faster approach, the downside is that bitsets produced will include bits set for
+ * documents that do not actually contain the field given.
+ *
+ */
+ int processingMode = PM_FULL_VALIDATION;
+ public static int PM_FULL_VALIDATION = 1;
+ public static int PM_FAST_INVALIDATION = 2;
+
+
+
+ public DuplicateFilter(String fieldName) : this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION)
+ {
+ }
+
+
+ public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+ {
+ this.fieldName = fieldName;
+ this.keepMode = keepMode;
+ this.processingMode = processingMode;
+ }
+
+ public override DocIdSet GetDocIdSet(IndexReader reader)
+ {
+ if (processingMode == PM_FAST_INVALIDATION)
+ {
+ return FastBits(reader);
+ }
+ else
+ {
+ return CorrectBits(reader);
+ }
+ }
+
+ private OpenBitSet CorrectBits(IndexReader reader)
+ {
+
+ OpenBitSet bits = new OpenBitSet(reader.MaxDoc()); //assume all are INvalid
+ Term startTerm = new Term(fieldName);
+ TermEnum te = reader.Terms(startTerm);
+ if (te != null)
+ {
+ Term currTerm = te.Term();
+ while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
+ {
+ int lastDoc = -1;
+ //set non duplicates
+ TermDocs td = reader.TermDocs(currTerm);
+ if (td.Next())
+ {
+ if (keepMode == KM_USE_FIRST_OCCURRENCE)
+ {
+ bits.Set(td.Doc());
+ }
+ else
+ {
+ do
+ {
+ lastDoc = td.Doc();
+ } while (td.Next());
+ bits.Set(lastDoc);
+ }
+ }
+ if (!te.Next())
+ {
+ break;
+ }
+ currTerm = te.Term();
+ }
+ }
+ return bits;
+ }
+
+ private OpenBitSet FastBits(IndexReader reader)
+ {
+ // "Fast" mode: start with every bit set, then clear the duplicates found for each term of fieldName.
+ OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
+ bits.Set(0, reader.MaxDoc()); //assume all are valid
+ Term startTerm = new Term(fieldName);
+ TermEnum te = reader.Terms(startTerm);
+ if (te != null)
+ {
+ Term currTerm = te.Term();
+ while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
+ {
+ if (te.DocFreq() > 1)
+ {
+ int lastDoc = -1;
+ //unset potential duplicates; honour the result of Next() (as CorrectBits does) so Doc() is only read after a successful advance
+ TermDocs td = reader.TermDocs(currTerm);
+ bool more = td.Next();
+ if (more && keepMode == KM_USE_FIRST_OCCURRENCE)
+ {
+ more = td.Next(); //step past the first occurrence so its bit stays set
+ }
+ while (more)
+ {
+ lastDoc = td.Doc();
+ bits.Clear(lastDoc);
+ more = td.Next();
+ }
+ if (lastDoc >= 0 && keepMode == KM_USE_LAST_OCCURRENCE)
+ {
+ //restore the last bit
+ bits.Set(lastDoc);
+ }
+ }
+ if (!te.Next())
+ {
+ break;
+ }
+ currTerm = te.Term();
+ }
+ }
+ return bits;
+ }
+
+ // /**
+ // * @param args
+ // * @throws IOException
+ // * @throws Exception
+ // */
+ // public static void main(String[] args)
+ // {
+ // IndexReader r=IndexReader.open("/indexes/personCentricAnon");
+ //// IndexReader r=IndexReader.open("/indexes/enron");
+ // long start=System.currentTimeMillis();
+ //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
+ //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+ // DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+ //// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
+ //// df.setProcessingMode(PM_SLOW_VALIDATION);
+ // BitSet b = df.bits(r);
+ // long end=System.currentTimeMillis()-start;
+ // System.out.println(b.cardinality()+" in "+end+" ms ");
+
+ // }
+
+
+ public String GetFieldName()
+ {
+ return fieldName;
+ }
+
+
+ public void SetFieldName(String fieldName)
+ {
+ this.fieldName = fieldName;
+ }
+
+
+ public int GetKeepMode()
+ {
+ return keepMode;
+ }
+
+
+ public void SetKeepMode(int keepMode)
+ {
+ this.keepMode = keepMode;
+ }
+
+
+ public override bool Equals(Object obj)
+ {
+ if (this == obj)
+ return true;
+ if ((obj == null) || (obj.GetType()!= this.GetType()))
+ return false;
+ DuplicateFilter other = (DuplicateFilter)obj;
+ return keepMode == other.keepMode &&
+ processingMode == other.processingMode &&
+ (fieldName == other.fieldName || (fieldName != null && fieldName.Equals(other.fieldName)));
+ }
+
+
+
+ public override int GetHashCode() // combines the same three fields that Equals compares
+ {
+ int hash = 217;
+ hash = 31 * hash + keepMode;
+ hash = 31 * hash + processingMode;
+ hash = 31 * hash + (fieldName == null ? 0 : fieldName.GetHashCode()); // null-safe: Equals accepts a null fieldName
+ return hash;
+ }
+
+
+ public int GetProcessingMode()
+ {
+ return processingMode;
+ }
+
+
+ public void SetProcessingMode(int processingMode)
+ {
+ this.processingMode = processingMode;
+ }
+ }
+}
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/FilterClause.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
+ class FilterClause
+ {
+ public FilterClause()
+ {
+ throw new NotImplementedException("Not implemented yet.");
+ }
+ }
+}
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,452 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Search;
+using Lucene.Net.Index;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search
+{
+ /// <summary>
+ /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+ /// In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
+ /// of fuzzy scoring factors.
+ /// This generally produces good results for queries where users may provide details in a number of
+ /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+ /// a fast query.
+ ///
+ /// For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
+ /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+ /// TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
+ /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
+ /// term) and this is factored into the variant's boost. If the source query term does not exist in the
+ /// index the average IDF of the variants is used.
+ /// </summary>
+ public class FuzzyLikeThisQuery : Query
+ {
+ static Similarity sim = new DefaultSimilarity();
+ Query rewrittenQuery = null;
+ ArrayList fieldVals = new ArrayList();
+ Analyzer analyzer;
+
+ ScoreTermQueue q;
+ int MAX_VARIANTS_PER_TERM = 50;
+ bool ignoreTF = false;
+ private int maxNumTerms;
+
+ public override int GetHashCode()
+ {
+ int prime = 31;
+ int result = 1; // built from the same fields that Equals compares
+ result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode());
+ // ArrayList does not override GetHashCode (it is per-instance), so fold in each element's hash instead
+ if (fieldVals != null) foreach (Object fv in fieldVals) result = prime * result + ((fv == null) ? 0 : fv.GetHashCode());
+ result = prime * result + (ignoreTF ? 1231 : 1237);
+ result = prime * result + maxNumTerms;
+ return result;
+ }
+
+ public override bool Equals(Object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (GetType() != obj.GetType())
+ return false;
+ FuzzyLikeThisQuery other = (FuzzyLikeThisQuery)obj;
+ if (analyzer == null)
+ {
+ if (other.analyzer != null)
+ return false;
+ }
+ else if (!analyzer.Equals(other.analyzer))
+ return false;
+ if (fieldVals == null)
+ {
+ if (other.fieldVals != null)
+ return false;
+ }
+ else if (!fieldVals.EqualsToArrayList(other.fieldVals))
+ return false;
+ if (ignoreTF != other.ignoreTF)
+ return false;
+ if (maxNumTerms != other.maxNumTerms)
+ return false;
+ return true;
+ }
+
+
+ /**
+ *
+ * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
+ * @param analyzer
+ */
+ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+ {
+ q = new ScoreTermQueue(maxNumTerms);
+ this.analyzer = analyzer;
+ this.maxNumTerms = maxNumTerms;
+ }
+
+ class FieldVals
+ {
+ internal String queryString;
+ internal String fieldName;
+ internal float minSimilarity;
+ internal int prefixLength;
+ public FieldVals(String name, float similarity, int length, String queryString)
+ {
+ fieldName = name;
+ minSimilarity = similarity;
+ prefixLength = length;
+ this.queryString = queryString;
+ }
+
+ public override int GetHashCode()
+ {
+ int prime = 31;
+ int result = 1;
+ result = prime * result
+ + ((fieldName == null) ? 0 : fieldName.GetHashCode());
+ result = prime * result + BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity),0);
+ result = prime * result + prefixLength;
+ result = prime * result
+ + ((queryString == null) ? 0 : queryString.GetHashCode());
+ return result;
+ }
+
+ public override bool Equals(Object obj)
+ {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (GetType() != obj.GetType())
+ return false;
+ FieldVals other = (FieldVals)obj;
+ if (fieldName == null)
+ {
+ if (other.fieldName != null)
+ return false;
+ }
+ else if (!fieldName.Equals(other.fieldName))
+ return false;
+ if (BitConverter.ToInt32(BitConverter.GetBytes(minSimilarity), 0) != BitConverter.ToInt32(BitConverter.GetBytes(other.minSimilarity), 0))
+ //if (Float.floatToIntBits(minSimilarity) != Float.floatToIntBits(other.minSimilarity))
+ return false;
+ if (prefixLength != other.prefixLength)
+ return false;
+ if (queryString == null)
+ {
+ if (other.queryString != null)
+ return false;
+ }
+ else if (!queryString.Equals(other.queryString))
+ return false;
+ return true;
+ }
+
+
+
+ }
+
+ /**
+ * Adds user input for "fuzzification"
+ * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
+ * @param fieldName
+ * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
+ * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
+ */
+ public void AddTerms(String queryString, String fieldName, float minSimilarity, int prefixLength)
+ {
+ fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
+ }
+
+
+ private void AddTerms(IndexReader reader, FieldVals f)
+ {
+ if (f.queryString == null) return;
+ TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
+ TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
+
+ int corpusNumDocs = reader.NumDocs();
+ Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
+ Hashtable processedTerms = new Hashtable();
+ while (ts.IncrementToken())
+ {
+ String term = termAtt.Term();
+ if (!processedTerms.Contains(term))
+ {
+ processedTerms.Add(term,term);
+ ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+ float minScore = 0;
+ Term startTerm = internSavingTemplateTerm.CreateTerm(term);
+ FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
+ TermEnum origEnum = reader.Terms(startTerm);
+ int df = 0;
+ if (startTerm.Equals(origEnum.Term()))
+ {
+ df = origEnum.DocFreq(); //store the df so all variants use same idf
+ }
+ int numVariants = 0;
+ int totalVariantDocFreqs = 0;
+ do
+ {
+ Term possibleMatch = fe.Term();
+ if (possibleMatch != null)
+ {
+ numVariants++;
+ totalVariantDocFreqs += fe.DocFreq();
+ float score = fe.Difference();
+ if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
+ {
+ ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
+ variantsQ.Insert(st);
+ minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
+ }
+ }
+ }
+ while (fe.Next());
+ if (numVariants > 0)
+ {
+ int avgDf = totalVariantDocFreqs / numVariants;
+ if (df == 0)//no direct match we can use as df for all variants
+ {
+ df = avgDf; //use avg df of all variants
+ }
+
+ // take the top variants (scored by edit distance) and reset the score
+ // to include an IDF factor then add to the global queue for ranking
+ // overall top query terms
+ int size = variantsQ.Size();
+ for (int i = 0; i < size; i++)
+ {
+ ScoreTerm st = (ScoreTerm)variantsQ.Pop();
+ st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
+ q.Insert(st);
+ }
+ }
+ }
+ }
+ }
+
+ public override Query Rewrite(IndexReader reader)
+ {
+ if (rewrittenQuery != null)
+ {
+ return rewrittenQuery;
+ }
+ //load up the list of possible terms
+ foreach (FieldVals f in fieldVals)
+ {
+ AddTerms(reader, f);
+ }
+ //for (Iterator iter = fieldVals.iterator(); iter.hasNext(); )
+ //{
+ // FieldVals f = (FieldVals)iter.next();
+ // addTerms(reader, f);
+ //}
+ //clear the list of fields
+ fieldVals.Clear();
+
+ BooleanQuery bq = new BooleanQuery();
+
+
+ //create BooleanQueries to hold the variants for each token/field pair and ensure it
+ // has no coord factor
+ //Step 1: sort the termqueries by term/field
+ Hashtable variantQueries = new Hashtable();
+ int size = q.Size();
+ for (int i = 0; i < size; i++)
+ {
+ ScoreTerm st = (ScoreTerm)q.Pop();
+ ArrayList l = (ArrayList)variantQueries[st.fuzziedSourceTerm];
+ if (l == null)
+ {
+ l = new ArrayList();
+ variantQueries.Add(st.fuzziedSourceTerm, l);
+ }
+ l.Add(st);
+ }
+ //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
+ foreach(ArrayList variants in variantQueries.Values)
+ //for (Iterator iter = variantQueries.values().iterator(); iter.hasNext(); )
+ {
+ //ArrayList variants = (ArrayList)iter.next();
+ if (variants.Count == 1)
+ {
+ //optimize where only one selected variant
+ ScoreTerm st = (ScoreTerm)variants[0];
+ TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF);
+ tq.SetBoost(st.score); // set the boost to a mix of IDF and score
+ bq.Add(tq, BooleanClause.Occur.SHOULD);
+ }
+ else
+ {
+ BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
+ foreach(ScoreTerm st in variants)
+ //for (Iterator iterator2 = variants.iterator(); iterator2.hasNext(); )
+ {
+ //ScoreTerm st = (ScoreTerm)iterator2.next();
+ TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF); // found a match
+ tq.SetBoost(st.score); // set the boost using the ScoreTerm's score
+ termVariants.Add(tq, BooleanClause.Occur.SHOULD); // add to query
+ }
+ bq.Add(termVariants, BooleanClause.Occur.SHOULD); // add to query
+ }
+ }
+ //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+ // booleans with a minimum-should-match of NumFields-1?
+ bq.SetBoost(GetBoost());
+ this.rewrittenQuery = bq;
+ return bq;
+ }
+
+ //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+ // term variants) then is reset with IDF for use in ranking against all other
+ // terms/fields
+ private class ScoreTerm
+ {
+ public Term term;
+ public float score;
+ internal Term fuzziedSourceTerm;
+
+ public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
+ {
+ this.term = term;
+ this.score = score;
+ this.fuzziedSourceTerm = fuzziedSourceTerm;
+ }
+ }
+
+ private class ScoreTermQueue : PriorityQueue
+ {
+ public ScoreTermQueue(int size)
+ {
+ Initialize(size);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
+ */
+ public override bool LessThan(Object a, Object b)
+ {
+ ScoreTerm termA = (ScoreTerm)a;
+ ScoreTerm termB = (ScoreTerm)b;
+ if (termA.score == termB.score)
+ return termA.term.CompareTo(termB.term) > 0;
+ else
+ return termA.score < termB.score;
+ }
+
+ }
+
+ //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
+ private class FuzzyTermQuery : TermQuery
+ {
+ bool ignoreTF;
+
+ public FuzzyTermQuery(Term t, bool ignoreTF): base(t)
+ {
+ this.ignoreTF = ignoreTF;
+ }
+
+ public override Similarity GetSimilarity(Searcher searcher)
+ {
+ Similarity result = base.GetSimilarity(searcher);
+ result = new AnonymousSimilarityDelegator(this,result);
+ return result;
+ }
+
+ class AnonymousSimilarityDelegator : SimilarityDelegator
+ {
+ FuzzyTermQuery parent = null;
+ public AnonymousSimilarityDelegator(FuzzyTermQuery parent,Similarity result) : base(result)
+ {
+ this.parent = parent;
+ }
+
+ public override float Tf(float freq)
+ {
+ if (parent.ignoreTF)
+ {
+ return 1; //ignore tf
+ }
+ return base.Tf(freq);
+ }
+
+ public override float Idf(int docFreq, int numDocs)
+ {
+ //IDF is already factored into individual term boosts
+ return 1;
+ }
+
+ public override float Coord(int overlap, int maxOverlap)
+ {
+ return base.Coord(overlap, maxOverlap);
+ }
+
+ public override float LengthNorm(string fieldName, int numTokens)
+ {
+ return base.LengthNorm(fieldName, numTokens);
+ }
+
+ public override float QueryNorm(float sumOfSquaredWeights)
+ {
+ return base.QueryNorm(sumOfSquaredWeights);
+ }
+
+ public override float SloppyFreq(int distance)
+ {
+ return base.SloppyFreq(distance);
+ }
+ }
+
+ }
+
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.Query#toString(java.lang.String)
+ */
+ public override String ToString(String field)
+ {
+ return null;
+ }
+
+
+ public bool IsIgnoreTF()
+ {
+ return ignoreTF;
+ }
+
+
+ public void SetIgnoreTF(bool ignoreTF)
+ {
+ this.ignoreTF = ignoreTF;
+ }
+
+ }
+}
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs Wed Mar 3 21:31:20 2010
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Queries.Net(Apache Lucene.Net)")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Queries.Net")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2010 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2010 The Apache Software Foundation")] // NOTE(review): identical to the AssemblyCopyright text above; confirm this is the intended trademark value
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("6107399b-3ded-4abc-ab60-9e41754258e1")]
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Build Number
+// Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("2.9.2")] // NOTE(review): three-part version, while Queries.Net.csproj in this commit references Lucene.Net Version=2.9.2.1; confirm the difference is intended
+[assembly: AssemblyFileVersion("2.9.2")]
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Queries.Net.csproj?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj Wed Mar 3 21:31:20 2010
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <ProductVersion>9.0.21022</ProductVersion>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>{481CF6E3-52AF-4621-9DEB-022122079AF6}</ProjectGuid>
+ <OutputType>Library</OutputType>
+ <AppDesignerFolder>Properties</AppDesignerFolder>
+ <RootNamespace>Lucene.Net.Search</RootNamespace>
+ <AssemblyName>Queries.Net</AssemblyName>
+ <TargetFrameworkVersion>v3.5</TargetFrameworkVersion>
+ <FileAlignment>512</FileAlignment>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <ErrorReport>prompt</ErrorReport>
+ <WarningLevel>4</WarningLevel>
+ </PropertyGroup>
+ <ItemGroup>
+ <Reference Include="Lucene.Net, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL">
+ <SpecificVersion>False</SpecificVersion>
+ <HintPath>..\..\..\..\DotNet\Work for 2.9\src\Test\bin\Release\Lucene.Net.dll</HintPath> <!-- NOTE(review): this path leaves the project tree ("DotNet\Work for 2.9") and looks specific to one developer's checkout; confirm, or point it at a repository-relative Lucene.Net.dll -->
+ </Reference>
+ <Reference Include="System" />
+ <Reference Include="System.Core">
+ <RequiredTargetFramework>3.5</RequiredTargetFramework>
+ </Reference>
+ <Reference Include="System.Xml.Linq">
+ <RequiredTargetFramework>3.5</RequiredTargetFramework>
+ </Reference>
+ <Reference Include="System.Data.DataSetExtensions">
+ <RequiredTargetFramework>3.5</RequiredTargetFramework>
+ </Reference>
+ <Reference Include="System.Data" />
+ <Reference Include="System.Xml" />
+ </ItemGroup>
+ <ItemGroup>
+ <Compile Include="BooleanFilter.cs" />
+ <Compile Include="BoostingQuery.cs" />
+ <Compile Include="DuplicateFilter.cs" />
+ <Compile Include="FilterClause.cs" />
+ <Compile Include="FuzzyLikeThisQuery.cs" />
+ <Compile Include="Similar\MoreLikeThis.cs" />
+ <Compile Include="Properties\AssemblyInfo.cs" />
+ <Compile Include="Similar\MoreLikeThisQuery.cs" />
+ <Compile Include="Similar\SimilarityQueries.cs" />
+ <Compile Include="Support.cs" />
+ <Compile Include="TermsFilter.cs" />
+ </ItemGroup>
+ <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
\ No newline at end of file