You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2011/11/23 07:55:22 UTC
[Lucene.Net] svn commit: r1205303 - in
/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries:
./ Similar/
Author: ccurrens
Date: Wed Nov 23 06:55:21 2011
New Revision: 1205303
URL: http://svn.apache.org/viewvc?rev=1205303&view=rev
Log:
ported changes to Contrib.Queries
Added:
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FileDiffs.txt
Removed:
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Support.cs
Modified:
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/BooleanFilter.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Contrib.Queries.csproj
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/DuplicateFilter.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FilterClause.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FuzzyLikeThisQuery.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThis.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThisQuery.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/SimilarityQueries.cs
incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/TermsFilter.cs
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/BooleanFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/BooleanFilter.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/BooleanFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/BooleanFilter.cs Wed Nov 23 06:55:21 2011
@@ -21,6 +21,7 @@ using System.Linq;
using System.Text;
using Lucene.Net.Index;
+using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Search
@@ -51,7 +52,7 @@ namespace Lucene.Net.Search
/// <returns></returns>
private DocIdSetIterator GetDISI(List<Filter> filters, int index, IndexReader reader)
{
- return ((Filter)filters[index]).GetDocIdSet(reader).Iterator();
+ return filters[index].GetDocIdSet(reader).Iterator();
}
/// <summary>
@@ -73,7 +74,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)shouldFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = shouldFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -98,7 +99,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)notFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = notFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -122,7 +123,7 @@ namespace Lucene.Net.Search
}
else
{
- DocIdSet dis = ((Filter)mustFilters[i]).GetDocIdSet(reader);
+ DocIdSet dis = mustFilters[i].GetDocIdSet(reader);
if (dis is OpenBitSet)
{
// optimized case for OpenBitSets
@@ -139,58 +140,52 @@ namespace Lucene.Net.Search
if (res != null)
return FinalResult(res, reader.MaxDoc());
- else
- {
- //TODO: 2.- change return DocIdSet.EMPTY_DOCIDSET;
- return null;
- }
+ return DocIdSet.EMPTY_DOCIDSET;
+ }
+
+ /** Provide a SortedVIntList when it is definitely smaller
+ * than an OpenBitSet.
+ * @deprecated Either use CachingWrapperFilter, or
+ * switch to a different DocIdSet implementation yourself.
+ * This method will be removed in Lucene 4.0
+ */
+ protected DocIdSet FinalResult(OpenBitSetDISI result, int maxDocs)
+ {
+ return result;
}
/// <summary>
/// Add a filter clause.
/// </summary>
/// <param name="filterClause">The clause to add.</param>
- public void Add(BooleanFilterClause filterClause)
+ public void Add(FilterClause filterClause)
{
- if (filterClause.Occur == BooleanClause.Occur.MUST)
+ if (filterClause.GetOccur() == BooleanClause.Occur.MUST)
{
if (mustFilters == null)
{
- mustFilters = new List<Filter>();
+ mustFilters = new EquatableList<Filter>();
}
- mustFilters.Add(filterClause.Filter);
+ mustFilters.Add(filterClause.GetFilter());
}
- if (filterClause.Occur == BooleanClause.Occur.SHOULD)
+ if (filterClause.GetOccur() == BooleanClause.Occur.SHOULD)
{
if (shouldFilters == null)
{
- shouldFilters = new List<Filter>();
+ shouldFilters = new EquatableList<Filter>();
}
- shouldFilters.Add(filterClause.Filter);
+ shouldFilters.Add(filterClause.GetFilter());
}
- if (filterClause.Occur == BooleanClause.Occur.MUST_NOT)
+ if (filterClause.GetOccur() == BooleanClause.Occur.MUST_NOT)
{
if (notFilters == null)
{
- notFilters = new List<Filter>();
+ notFilters = new EquatableList<Filter>();
}
- notFilters.Add(filterClause.Filter);
+ notFilters.Add(filterClause.GetFilter());
}
}
- // TODO: in 3.0, instead of removing this deprecated
- // method, make it a no-op and mark it final
- /** Provide a SortedVIntList when it is definitely smaller
- * than an OpenBitSet.
- * @deprecated Either use CachingWrapperFilter, or
- * switch to a different DocIdSet implementation yourself. */
- protected DocIdSet FinalResult(OpenBitSetDISI result, int maxDocs)
- {
- return (result.Cardinality() < (maxDocs / 9))
- ? (DocIdSet)new SortedVIntList(result)
- : (DocIdSet)result;
- }
-
/// <summary>
/// Determine equality between two lists.
/// </summary>
@@ -283,38 +278,4 @@ namespace Lucene.Net.Search
}
}
}
-
- /// <summary>
- /// A spefic clause that makes up a part of the BooleanFilter
- /// </summary>
- public class BooleanFilterClause
- {
- /// <summary>
- /// Create a new BooleanFilterClause
- /// </summary>
- /// <param name="filter">A Filter object</param>
- /// <param name="occur">A parameter implementation indicating SHOULD, MUST or MUST NOT</param>
- public BooleanFilterClause(Filter filter, BooleanClause.Occur occur)
- {
- this.Occur = occur;
- this.Filter = filter;
- }
-
- /// <summary>
- /// The underlying filter for the clause.
- /// </summary>
- public Filter Filter
- {
- get;
- private set;
- }
- /// <summary>
- /// The occurrence of this clause.
- /// </summary>
- public BooleanClause.Occur Occur
- {
- get;
- private set;
- }
- }
}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Contrib.Queries.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Contrib.Queries.csproj?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Contrib.Queries.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Contrib.Queries.csproj Wed Nov 23 06:55:21 2011
@@ -19,7 +19,6 @@
under the License.
-->
-
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
@@ -92,7 +91,6 @@
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Similar\MoreLikeThisQuery.cs" />
<Compile Include="Similar\SimilarityQueries.cs" />
- <Compile Include="Support.cs" />
<Compile Include="TermsFilter.cs" />
</ItemGroup>
<ItemGroup>
@@ -126,6 +124,9 @@
<ItemGroup>
<None Include="Lucene.Net.snk" />
</ItemGroup>
+ <ItemGroup>
+ <Content Include="FileDiffs.txt" />
+ </ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/DuplicateFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/DuplicateFilter.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/DuplicateFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/DuplicateFilter.cs Wed Nov 23 06:55:21 2011
@@ -28,7 +28,6 @@ namespace Lucene.Net.Search
{
public class DuplicateFilter : Filter
{
-
String fieldName;
/**
@@ -82,7 +81,6 @@ namespace Lucene.Net.Search
private OpenBitSet CorrectBits(IndexReader reader)
{
-
OpenBitSet bits = new OpenBitSet(reader.MaxDoc()); //assume all are INvalid
Term startTerm = new Term(fieldName);
TermEnum te = reader.Terms(startTerm);
@@ -121,7 +119,6 @@ namespace Lucene.Net.Search
private OpenBitSet FastBits(IndexReader reader)
{
-
OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
bits.Set(0, reader.MaxDoc()); //assume all are valid
Term startTerm = new Term(fieldName);
@@ -163,28 +160,6 @@ namespace Lucene.Net.Search
return bits;
}
- // /**
- // * <param name="args"></param>
- // * @throws IOException
- // * @throws Exception
- // */
- // public static void main(String[] args)
- // {
- // IndexReader r=IndexReader.open("/indexes/personCentricAnon");
- //// IndexReader r=IndexReader.open("/indexes/enron");
- // long start=System.currentTimeMillis();
- //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
- //// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
- // DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
- //// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
- //// df.setProcessingMode(PM_SLOW_VALIDATION);
- // BitSet b = df.bits(r);
- // long end=System.currentTimeMillis()-start;
- // System.out.println(b.cardinality()+" in "+end+" ms ");
-
- // }
-
-
public String GetFieldName()
{
return fieldName;
Added: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FileDiffs.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FileDiffs.txt?rev=1205303&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FileDiffs.txt (added)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FileDiffs.txt Wed Nov 23 06:55:21 2011
@@ -0,0 +1,11 @@
+similar\MoreLikeThis.java - PORTED
+similar\MoreLikeThisQuery.java - PORTED
+similar\SimilarityQueries.java - PORTED
+BooleanFilter.java - PORTED
+BoostingQuery.java - PORTED
+DuplicateFilter.java - PORTED
+FilterClause.java - PORTED
+FuzzyLikeThisQuery.java - PORTED
+TermsFilter.java - PORTED
+
+All Files and All tests ported
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FilterClause.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FilterClause.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FilterClause.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FilterClause.cs Wed Nov 23 06:55:21 2011
@@ -22,11 +22,45 @@ using System.Text;
namespace Lucene.Net.Search
{
- class FilterClause
+ /**
+ * A Filter that is wrapped with an indication of how that filter
+ * is used when composed with another filter.
+ * (Follows the boolean logic in BooleanClause for composition
+ * of queries.)
+ */
+ [Serializable]
+ public class FilterClause
{
- public FilterClause()
+ BooleanClause.Occur occur;
+ Filter filter;
+
+ /**
+ * Create a new FilterClause
+ * @param filter A Filter object containing a BitSet
+ * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
+ */
+ public FilterClause(Filter filter, BooleanClause.Occur occur)
+ {
+ this.occur = occur;
+ this.filter = filter;
+ }
+
+ /**
+ * Returns this FilterClause's filter
+ * @return A Filter object
+ */
+ public Filter GetFilter()
+ {
+ return filter;
+ }
+
+ /**
+ * Returns this FilterClause's occur parameter
+ * @return An Occur object
+ */
+ public BooleanClause.Occur GetOccur()
{
- throw new NotImplementedException("Not implemented yet.");
+ return occur;
}
}
-}
+}
\ No newline at end of file
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FuzzyLikeThisQuery.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FuzzyLikeThisQuery.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FuzzyLikeThisQuery.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/FuzzyLikeThisQuery.cs Wed Nov 23 06:55:21 2011
@@ -25,6 +25,7 @@ using Lucene.Net.Search;
using Lucene.Net.Index;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Support;
using Lucene.Net.Util;
namespace Lucene.Net.Search
@@ -48,7 +49,7 @@ namespace Lucene.Net.Search
{
static Similarity sim = new DefaultSimilarity();
Query rewrittenQuery = null;
- ArrayList fieldVals = new ArrayList();
+ EquatableList<FieldVals> fieldVals = new EquatableList<FieldVals>();
Analyzer analyzer;
ScoreTermQueue q;
@@ -89,7 +90,7 @@ namespace Lucene.Net.Search
if (other.fieldVals != null)
return false;
}
- else if (!fieldVals.EqualsToArrayList(other.fieldVals))
+ else if (!fieldVals.Equals(other.fieldVals))
return false;
if (ignoreTF != other.ignoreTF)
return false;
@@ -190,17 +191,17 @@ namespace Lucene.Net.Search
{
if (f.queryString == null) return;
TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
- TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
+ TermAttribute termAtt = ts.AddAttribute<TermAttribute>();
int corpusNumDocs = reader.NumDocs();
Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
- Hashtable processedTerms = new Hashtable();
+ HashSet<string> processedTerms = new HashSet<string>();
while (ts.IncrementToken())
{
String term = termAtt.Term();
if (!processedTerms.Contains(term))
{
- processedTerms.Add(term,term);
+ processedTerms.Add(term);
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = internSavingTemplateTerm.CreateTerm(term);
@@ -224,8 +225,8 @@ namespace Lucene.Net.Search
if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
{
ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
- variantsQ.Insert(st);
- minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
+ variantsQ.InsertWithOverflow(st);
+ minScore = variantsQ.Top().score; // maintain minScore
}
}
}
@@ -244,9 +245,9 @@ namespace Lucene.Net.Search
int size = variantsQ.Size();
for (int i = 0; i < size; i++)
{
- ScoreTerm st = (ScoreTerm)variantsQ.Pop();
+ ScoreTerm st = variantsQ.Pop();
st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
- q.Insert(st);
+ q.InsertWithOverflow(st);
}
}
}
@@ -264,11 +265,6 @@ namespace Lucene.Net.Search
{
AddTerms(reader, f);
}
- //for (Iterator iter = fieldVals.iterator(); iter.hasNext(); )
- //{
- // FieldVals f = (FieldVals)iter.next();
- // addTerms(reader, f);
- //}
//clear the list of fields
fieldVals.Clear();
@@ -278,28 +274,26 @@ namespace Lucene.Net.Search
//create BooleanQueries to hold the variants for each token/field pair and ensure it
// has no coord factor
//Step 1: sort the termqueries by term/field
- Hashtable variantQueries = new Hashtable();
+ HashMap<Term, List<ScoreTerm>> variantQueries = new HashMap<Term, List<ScoreTerm>>();
int size = q.Size();
for (int i = 0; i < size; i++)
{
- ScoreTerm st = (ScoreTerm)q.Pop();
- ArrayList l = (ArrayList)variantQueries[st.fuzziedSourceTerm];
+ ScoreTerm st = q.Pop();
+ var l = variantQueries[st.fuzziedSourceTerm];
if (l == null)
{
- l = new ArrayList();
+ l = new List<ScoreTerm>();
variantQueries.Add(st.fuzziedSourceTerm, l);
}
l.Add(st);
}
//Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
- foreach(ArrayList variants in variantQueries.Values)
- //for (Iterator iter = variantQueries.values().iterator(); iter.hasNext(); )
+ foreach(var variants in variantQueries.Values)
{
- //ArrayList variants = (ArrayList)iter.next();
if (variants.Count == 1)
{
//optimize where only one selected variant
- ScoreTerm st = (ScoreTerm)variants[0];
+ ScoreTerm st = variants[0];
TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF);
tq.SetBoost(st.score); // set the boost to a mix of IDF and score
bq.Add(tq, BooleanClause.Occur.SHOULD);
@@ -308,9 +302,7 @@ namespace Lucene.Net.Search
{
BooleanQuery termVariants = new BooleanQuery(true); //disable coord and IDF for these term variants
foreach(ScoreTerm st in variants)
- //for (Iterator iterator2 = variants.iterator(); iterator2.hasNext(); )
{
- //ScoreTerm st = (ScoreTerm)iterator2.next();
TermQuery tq = new FuzzyTermQuery(st.term, ignoreTF); // found a match
tq.SetBoost(st.score); // set the boost using the ScoreTerm's score
termVariants.Add(tq, BooleanClause.Occur.SHOULD); // add to query
@@ -342,7 +334,7 @@ namespace Lucene.Net.Search
}
}
- private class ScoreTermQueue : PriorityQueue
+ private class ScoreTermQueue : PriorityQueue<ScoreTerm>
{
public ScoreTermQueue(int size)
{
@@ -352,10 +344,8 @@ namespace Lucene.Net.Search
/* (non-Javadoc)
* <see cref="org.apache.lucene.util.PriorityQueue.lessThan(java.lang.Object, java.lang.Object)"/>
*/
- public override bool LessThan(Object a, Object b)
+ public override bool LessThan(ScoreTerm termA, ScoreTerm termB)
{
- ScoreTerm termA = (ScoreTerm)a;
- ScoreTerm termB = (ScoreTerm)b;
if (termA.score == termB.score)
return termA.term.CompareTo(termB.term) > 0;
else
@@ -403,28 +393,7 @@ namespace Lucene.Net.Search
//IDF is already factored into individual term boosts
return 1;
}
-
- public override float Coord(int overlap, int maxOverlap)
- {
- return base.Coord(overlap, maxOverlap);
- }
-
- public override float LengthNorm(string fieldName, int numTokens)
- {
- return base.LengthNorm(fieldName, numTokens);
- }
-
- public override float QueryNorm(float sumOfSquaredWeights)
- {
- return base.QueryNorm(sumOfSquaredWeights);
- }
-
- public override float SloppyFreq(int distance)
- {
- return base.SloppyFreq(distance);
- }
}
-
}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThis.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThis.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThis.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThis.cs Wed Nov 23 06:55:21 2011
@@ -16,8 +16,12 @@
*/
using System;
-
-using PriorityQueue = Lucene.Net.Util.PriorityQueue;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
using IndexReader = Lucene.Net.Index.IndexReader;
using Term = Lucene.Net.Index.Term;
using TermFreqVector = Lucene.Net.Index.TermFreqVector;
@@ -27,7 +31,6 @@ using TermQuery = Lucene.Net.Search.Term
using BooleanQuery = Lucene.Net.Search.BooleanQuery;
using IndexSearcher = Lucene.Net.Search.IndexSearcher;
using Query = Lucene.Net.Search.Query;
-using Hits = Lucene.Net.Search.Hits;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
@@ -36,8 +39,6 @@ using Lucene.Net.Analysis.Tokenattribute
namespace Lucene.Net.Search.Similar
{
-
-
/// <summary> Generate "more like this" similarity queries.
/// Based on this mail:
/// <pre>
@@ -114,6 +115,8 @@ namespace Lucene.Net.Search.Similar
/// <ul>
/// <li> <see cref="SetMinTermFreq"/> </li>
/// <li> <see cref="SetMinDocFreq"/> </li>
+ /// <li> <see cref="SetMaxDocFreq"/></li>
+ /// <li> <see cref="SetMaxDocFreqPct"/></li>
/// <li> <see cref="SetMinWordLen"/> </li>
/// <li> <see cref="SetMaxWordLen"/></li>
/// <li> <see cref="SetMaxQueryTerms"/></li>
@@ -144,7 +147,7 @@ namespace Lucene.Net.Search.Similar
/// <summary> Default analyzer to parse source doc with.</summary>
/// <seealso cref="GetAnalyzer">
/// </seealso>
- public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
+ public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer(Util.Version.LUCENE_CURRENT);
/// <summary> Ignore terms with less than this frequency in the source doc.</summary>
/// <seealso cref="GetMinTermFreq">
@@ -158,7 +161,15 @@ namespace Lucene.Net.Search.Similar
/// </seealso>
/// <seealso cref="SetMinDocFreq">
/// </seealso>
- public const int DEFALT_MIN_DOC_FREQ = 5;
+ public const int DEFAULT_MIN_DOC_FREQ = 5;
+
+ /// <summary>
+ /// Ignore words which occur in more than this many docs
+ /// </summary>
+ /// <seealso cref="GetMaxDocFreq"/>
+ /// <seealso cref="SetMaxDocFreq"/>
+ /// <seealso cref="SetMaxDocFreqPct"/>
+ public const int DEFAULT_MAX_DOC_FREQ = int.MaxValue;
/// <summary> Boost terms in query based on score.</summary>
/// <seealso cref="IsBoost">
@@ -194,10 +205,10 @@ namespace Lucene.Net.Search.Similar
/// </seealso>
/// <seealso cref="GetStopWords">
/// </seealso>
- public static readonly System.Collections.Hashtable DEFAULT_STOP_WORDS = null;
+ public static readonly ISet<string> DEFAULT_STOP_WORDS = null;
/// <summary> Current set of stop words.</summary>
- private System.Collections.Hashtable stopWords = DEFAULT_STOP_WORDS;
+ private ISet<string> stopWords = DEFAULT_STOP_WORDS;
/// <summary> Return a Query with no more than this many terms.
///
@@ -217,7 +228,12 @@ namespace Lucene.Net.Search.Similar
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
/// <summary> Ignore words which do not occur in at least this many docs.</summary>
- private int minDocFreq = DEFALT_MIN_DOC_FREQ;
+ private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
+
+ /// <summary>
+ /// Ignore words which occur in more than this many docs.
+ /// </summary>
+ private int maxDocfreq = DEFAULT_MAX_DOC_FREQ;
/// <summary> Should we apply a boost to the Query based on the scores?</summary>
private bool boost = DEFAULT_BOOST;
@@ -228,8 +244,6 @@ namespace Lucene.Net.Search.Similar
/// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary>
private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
-
-
/// <summary> Ignore words if less than this len.</summary>
private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
@@ -266,7 +280,7 @@ namespace Lucene.Net.Search.Similar
}
/// <summary> Constructor requiring an IndexReader.</summary>
- public MoreLikeThis(IndexReader ir) : this(ir,new DefaultSimilarity() )
+ public MoreLikeThis(IndexReader ir) : this(ir,new DefaultSimilarity())
{
}
@@ -332,7 +346,7 @@ namespace Lucene.Net.Search.Similar
}
/// <summary> Returns the frequency at which words will be ignored which do not occur in at least this
- /// many docs. The default frequency is <see cref="DEFALT_MIN_DOC_FREQ"/>.
+ /// many docs. The default frequency is <see cref="DEFAULT_MIN_DOC_FREQ"/>.
///
/// </summary>
/// <returns> the frequency at which words will be ignored which do not occur in at least this
@@ -355,6 +369,43 @@ namespace Lucene.Net.Search.Similar
this.minDocFreq = minDocFreq;
}
+ /// <summary>
+ /// Returns the maximum frequency in which words may still appear.
+ /// Words that appear in more than this many docs will be ignored. The default frequency is
+ /// <see cref="DEFAULT_MAX_DOC_FREQ"/>
+ /// </summary>
+ /// <returns>get the maximum frequency at which words are still allowed,
+ /// words which occur in more docs than this are ignored.</returns>
+ public int GetMaxDocFreq()
+ {
+ return this.maxDocfreq;
+ }
+
+ /// <summary>
+ /// Set the maximum frequency in which words may still appear. Words that appear
+ /// in more than this many docs will be ignored.
+ /// </summary>
+ /// <param name="maxFreq">
+ /// the maximum count of documents that a term may appear
+ /// in to be still considered relevant</param>
+ public void SetMaxDocFreq(int maxFreq)
+ {
+ this.maxDocfreq = maxFreq;
+ }
+
+ /// <summary>
+ /// Set the maximum percentage in which words may still appear. Words that appear
+ /// in more than this many percent of all docs will be ignored.
+ /// </summary>
+ /// <param name="maxPercentage">
+ /// the maximum percentage of documents (0-100) that a term may appear
+ /// in to be still considered relevant
+ /// </param>
+ public void SetMaxDocFreqPct(int maxPercentage)
+ {
+ this.maxDocfreq = maxPercentage * ir.NumDocs() / 100;
+ }
+
/// <summary> Returns whether to boost terms in query based on "score" or not. The default is
/// <see cref="DEFAULT_BOOST"/>.
///
@@ -459,7 +510,7 @@ namespace Lucene.Net.Search.Similar
/// </seealso>
/// <seealso cref="GetStopWords">
/// </seealso>
- public void SetStopWords(System.Collections.Hashtable stopWords)
+ public void SetStopWords(ISet<string> stopWords)
{
this.stopWords = stopWords;
}
@@ -467,7 +518,7 @@ namespace Lucene.Net.Search.Similar
/// <summary> Get the current stop words being used.</summary>
/// <seealso cref="SetStopWords">
/// </seealso>
- public System.Collections.Hashtable GetStopWords()
+ public ISet<string> GetStopWords()
{
return stopWords;
}
@@ -511,24 +562,16 @@ namespace Lucene.Net.Search.Similar
maxNumTokensParsed = i;
}
-
-
-
- /// <summary> Return a query that will return docs like the passed lucene document ID.
- ///
- /// </summary>
- /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This" query for.
- /// </param>
- /// <returns> a query that will return docs like the passed lucene document ID.
- /// </returns>
+ /// <summary>Return a query that will return docs like the passed lucene document ID.</summary>
+ /// <param name="docNum">the documentID of the lucene doc to generate the "More Like This" query for.</param>
+ /// <returns> a query that will return docs like the passed lucene document ID.</returns>
public Query Like(int docNum)
{
if (fieldNames == null)
{
// gather list of valid fields from lucene
- System.Collections.Generic.ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
- fieldNames = new string[fields.Count];
- fields.CopyTo(fieldNames, 0);
+ ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+ fieldNames = fields.ToArray();
}
return CreateQuery(RetrieveTerms(docNum));
@@ -544,9 +587,8 @@ namespace Lucene.Net.Search.Similar
if (fieldNames == null)
{
// gather list of valid fields from lucene
- System.Collections.Generic.ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
- fieldNames = new string[fields.Count];
- fields.CopyTo(fieldNames, 0);
+ ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+ fieldNames = fields.ToArray();
}
return Like(new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default));
@@ -559,7 +601,7 @@ namespace Lucene.Net.Search.Similar
/// </returns>
public Query Like(System.Uri u)
{
- return Like(new System.IO.StreamReader(((System.Net.HttpWebRequest)System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
+ return Like(new System.IO.StreamReader((System.Net.WebRequest.Create(u)).GetResponse().GetResponseStream(), System.Text.Encoding.Default));
}
/// <summary> Return a query that will return docs like the passed stream.
@@ -583,7 +625,7 @@ namespace Lucene.Net.Search.Similar
}
/// <summary> Create the More like query from a PriorityQueue</summary>
- private Query CreateQuery(PriorityQueue q)
+ private Query CreateQuery(PriorityQueue<object[]> q)
{
BooleanQuery query = new BooleanQuery();
System.Object cur;
@@ -599,9 +641,9 @@ namespace Lucene.Net.Search.Similar
{
if (qterms == 0)
{
- bestScore = (float)((System.Single)ar[2]);
+ bestScore = (float)ar[2];
}
- float myScore = (float)((System.Single)ar[2]);
+ float myScore = (float)ar[2];
tq.SetBoost(boostFactor * myScore / bestScore);
}
@@ -630,19 +672,19 @@ namespace Lucene.Net.Search.Similar
/// </summary>
/// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
/// </param>
- private PriorityQueue CreateQueue(System.Collections.IDictionary words)
+ private PriorityQueue<object[]> CreateQueue(IDictionary<string,Int> words)
{
// have collected all words in doc and their freqs
int numDocs = ir.NumDocs();
FreqQ res = new FreqQ(words.Count); // will order words by score
- System.Collections.IEnumerator it = words.Keys.GetEnumerator();
+ var it = words.Keys.GetEnumerator();
while (it.MoveNext())
{
// for every word
- System.String word = (System.String)it.Current;
+ System.String word = it.Current;
- int tf = ((Int)words[word]).x; // term freq in the source doc
+ int tf = words[word].x; // term freq in the source doc
if (minTermFreq > 0 && tf < minTermFreq)
{
continue; // filter out words that don't occur enough times in the source
@@ -663,6 +705,11 @@ namespace Lucene.Net.Search.Similar
continue; // filter out words that don't occur in enough docs
}
+ if (docFreq > maxDocfreq)
+ {
+ continue; // filter out words that occur in too many docs
+ }
+
if (docFreq == 0)
{
continue; // index update problem?
@@ -672,7 +719,7 @@ namespace Lucene.Net.Search.Similar
float score = tf * idf;
// only really need 1st 3 entries, other ones are for troubleshooting
- res.Insert(new System.Object[] { word, topField, (float)score, (float)idf, (System.Int32)docFreq, (System.Int32)tf });
+ res.InsertWithOverflow(new System.Object[] { word, topField, score, idf, docFreq, tf });
}
return res;
}
@@ -728,7 +775,8 @@ namespace Lucene.Net.Search.Similar
temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
temp_writer.AutoFlush = true;
System.IO.StreamWriter o = temp_writer;
- IndexReader r = IndexReader.Open(indexName);
+ FSDirectory dir = FSDirectory.Open(new DirectoryInfo(indexName));
+ IndexReader r = IndexReader.Open(dir, true);
o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");
MoreLikeThis mlt = new MoreLikeThis(r);
@@ -751,17 +799,18 @@ namespace Lucene.Net.Search.Similar
o.WriteLine("q: " + query);
o.WriteLine();
- IndexSearcher searcher = new IndexSearcher(indexName);
+ IndexSearcher searcher = new IndexSearcher(dir, true);
- Hits hits = searcher.Search(query);
- int len = hits.Length();
+ TopDocs hits = searcher.Search(query, null, 25);
+ int len = hits.TotalHits;
o.WriteLine("found: " + len + " documents matching");
o.WriteLine();
+ ScoreDoc[] scoreDocs = hits.ScoreDocs;
for (int i = 0; i < System.Math.Min(25, len); i++)
{
- Document d = hits.Doc(i);
+ Document d = searcher.Doc(scoreDocs[i].doc);
System.String summary = d.Get("summary");
- o.WriteLine("score : " + hits.Score(i));
+ o.WriteLine("score : " + scoreDocs[i].score);
o.WriteLine("url : " + d.Get("url"));
o.WriteLine("\ttitle : " + d.Get("title"));
if (summary != null)
@@ -775,9 +824,9 @@ namespace Lucene.Net.Search.Similar
/// </summary>
/// <param name="docNum">the id of the lucene document from which to find terms
/// </param>
- private PriorityQueue RetrieveTerms(int docNum)
+ private PriorityQueue<object[]> RetrieveTerms(int docNum)
{
- System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
+ IDictionary<string,Int> termFreqMap = new HashMap<string,Int>();
for (int i = 0; i < fieldNames.Length; i++)
{
System.String fieldName = fieldNames[i];
@@ -810,7 +859,7 @@ namespace Lucene.Net.Search.Similar
/// </param>
/// <param name="vector">List of terms and their frequencies for a doc/field
/// </param>
- private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
+ private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, TermFreqVector vector)
{
System.String[] terms = vector.GetTerms();
int[] freqs = vector.GetTermFrequencies();
@@ -823,7 +872,7 @@ namespace Lucene.Net.Search.Similar
continue;
}
// increment frequency
- Int cnt = (Int)termFreqMap[term];
+ Int cnt = termFreqMap[term];
if (cnt == null)
{
cnt = new Int();
@@ -843,12 +892,12 @@ namespace Lucene.Net.Search.Similar
/// </param>
/// <param name="fieldName">Used by analyzer for any special per-field analysis
/// </param>
- private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
+ private void AddTermFrequencies(System.IO.TextReader r, IDictionary<string,Int> termFreqMap, System.String fieldName)
{
TokenStream ts = analyzer.TokenStream(fieldName, r);
int tokenCount=0;
// for every token
- TermAttribute termAtt = (TermAttribute) ts.AddAttribute(typeof(TermAttribute));
+ TermAttribute termAtt = ts.AddAttribute<TermAttribute>();
while (ts.IncrementToken()) {
string word = termAtt.Term();
@@ -862,7 +911,7 @@ namespace Lucene.Net.Search.Similar
}
// increment frequency
- Int cnt = (Int) termFreqMap[word];
+ Int cnt = termFreqMap[word];
if (cnt == null) {
termFreqMap[word] = new Int();
}
@@ -923,9 +972,9 @@ namespace Lucene.Net.Search.Similar
/// </returns>
/// <seealso cref="RetrieveInterestingTerms(System.IO.TextReader)">
/// </seealso>
- public PriorityQueue RetrieveTerms(System.IO.TextReader r)
+ public PriorityQueue<object[]> RetrieveTerms(System.IO.TextReader r)
{
- System.Collections.IDictionary words = new System.Collections.Hashtable();
+ IDictionary<string, Int> words = new HashMap<string,Int>();
for (int i = 0; i < fieldNames.Length; i++)
{
System.String fieldName = fieldNames[i];
@@ -937,8 +986,8 @@ namespace Lucene.Net.Search.Similar
public System.String[] RetrieveInterestingTerms(int docNum)
{
- System.Collections.ArrayList al = new System.Collections.ArrayList(maxQueryTerms);
- PriorityQueue pq = RetrieveTerms(docNum);
+ List<object> al = new List<object>(maxQueryTerms);
+ PriorityQueue<object[]> pq = RetrieveTerms(docNum);
System.Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
@@ -947,9 +996,9 @@ namespace Lucene.Net.Search.Similar
System.Object[] ar = (System.Object[])cur;
al.Add(ar[0]); // the 1st entry is the interesting word
}
- System.String[] res = new System.String[al.Count];
- // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
- return (System.String[])al.ToArray(typeof(System.String));
+ //System.String[] res = new System.String[al.Count];
+ //return al.toArray(res);
+ return al.Select(x => x.ToString()).ToArray();
}
/// <summary> Convenience routine to make it easy to return the most interesting words in a document.
@@ -966,8 +1015,8 @@ namespace Lucene.Net.Search.Similar
/// </seealso>
public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
{
- System.Collections.ArrayList al = new System.Collections.ArrayList(maxQueryTerms);
- PriorityQueue pq = RetrieveTerms(r);
+ List<object> al = new List<object>(maxQueryTerms);
+ PriorityQueue<object[]> pq = RetrieveTerms(r);
System.Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
@@ -976,25 +1025,23 @@ namespace Lucene.Net.Search.Similar
System.Object[] ar = (System.Object[])cur;
al.Add(ar[0]); // the 1st entry is the interesting word
}
- System.String[] res = new System.String[al.Count];
+ //System.String[] res = new System.String[al.Count];
// return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
- return (System.String[])al.ToArray(typeof(System.String));
+ return al.Select(x => x.ToString()).ToArray();
}
/// <summary> PriorityQueue that orders words by score.</summary>
- private class FreqQ : PriorityQueue
+ private class FreqQ : PriorityQueue<object[]>
{
internal FreqQ(int s)
{
Initialize(s);
}
- override public bool LessThan(System.Object a, System.Object b)
+ override public bool LessThan(System.Object[] aa, System.Object[] bb)
{
- System.Object[] aa = (System.Object[])a;
- System.Object[] bb = (System.Object[])b;
- System.Single fa = (System.Single)aa[2];
- System.Single fb = (System.Single)bb[2];
+ float fa = (float)aa[2];
+ float fb = (float)bb[2];
return (float)fa > (float)fb;
}
}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThisQuery.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThisQuery.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThisQuery.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/MoreLikeThisQuery.cs Wed Nov 23 06:55:21 2011
@@ -35,15 +35,13 @@ namespace Lucene.Net.Search.Similar
*/
public class MoreLikeThisQuery : Query
{
-
-
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
float percentTermsToMatch = 0.3f;
int minTermFrequency = 1;
int maxQueryTerms = 5;
- System.Collections.Hashtable stopWords = null;
+ ISet<string> stopWords = null;
int minDocFreq = -1;
@@ -144,11 +142,11 @@ namespace Lucene.Net.Search.Similar
{
this.moreLikeFields = moreLikeFields;
}
- public System.Collections.Hashtable GetStopWords()
+ public ISet<string> GetStopWords()
{
return stopWords;
}
- public void SetStopWords(System.Collections.Hashtable stopWords)
+ public void SetStopWords(ISet<string> stopWords)
{
this.stopWords = stopWords;
}
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/SimilarityQueries.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/SimilarityQueries.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/SimilarityQueries.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/Similar/SimilarityQueries.cs Wed Nov 23 06:55:21 2011
@@ -16,6 +16,7 @@
*/
using System;
+using System.Collections.Generic;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Term = Lucene.Net.Index.Term;
@@ -82,13 +83,13 @@ namespace Similarity.Net
/// <returns> a query with all unique words in 'body'
/// </returns>
/// <throws> IOException this can't happen... </throws>
- public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
+ public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet<string> stop)
{
TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
- TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
+ TermAttribute termAtt = ts.AddAttribute<TermAttribute>();
BooleanQuery tmp = new BooleanQuery();
- System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
+ ISet<string> already = new HashSet<string>(); // ignore dups
while (ts.IncrementToken())
{
String word = termAtt.Term();
@@ -96,9 +97,9 @@ namespace Similarity.Net
if (stop != null && stop.Contains(word))
continue;
// ignore dups
- if (already.Contains(word) == true)
+ if (already.Contains(word))
continue;
- already.Add(word, word);
+ already.Add(word);
// add to query
TermQuery tq = new TermQuery(new Term(field, word));
try
Modified: incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/TermsFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/TermsFilter.cs?rev=1205303&r1=1205302&r2=1205303&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/TermsFilter.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net.3.0.3/trunk/src/contrib/Queries/TermsFilter.cs Wed Nov 23 06:55:21 2011
@@ -33,7 +33,7 @@ namespace Lucene.Net.Search
/// <summary>
/// The set of terms for this filter.
/// </summary>
- protected HashSet<Term> terms = new HashSet<Term>();
+ protected ISet<Term> terms = new SortedSet<Term>();
/// <summary>
/// Add a term to the set.
@@ -83,6 +83,7 @@ namespace Lucene.Net.Search
return false;
}
TermsFilter test = (TermsFilter)obj;
+ // TODO: Does SortedSet have any issues like List<T>? See EquatableList in Support
return (terms == test.terms || (terms != null && terms.Equals(test.terms)));
}