You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by ar...@apache.org on 2007/03/10 20:26:34 UTC
svn commit: r516769 - in /incubator/lucene.net/trunk/C#/contrib: ./
Similarity.Net/ Similarity.Net/Similarity.Net/
Similarity.Net/Similarity.Net/Similar/
Author: aroush
Date: Sat Mar 10 11:26:32 2007
New Revision: 516769
URL: http://svn.apache.org/viewvc?view=rev&rev=516769
Log:
Added Similarity.Net
Added:
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj
incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln
Modified:
incubator/lucene.net/trunk/C#/contrib/README.txt
Modified: incubator/lucene.net/trunk/C#/contrib/README.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/README.txt?view=diff&rev=516769&r1=516768&r2=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/README.txt (original)
+++ incubator/lucene.net/trunk/C#/contrib/README.txt Sat Mar 10 11:26:32 2007
@@ -8,6 +8,7 @@
Snowball.Net
SpellChecker.Net
WordNet.Net
+Similarity.Net
Contributed code:
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/ABOUT.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1 @@
+Similarity.Net is a port of Java Similarity to C#. The port from Java to C# of version 2.0.0 is done by George Aroush. To contact George Aroush please visit http://www.aroush.net/
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/HISTORY.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,6 @@
+Similarity.Net History
+----------------------
+
+
+10Mar07:
+ - Release: Similarity.Net 2.0.0 build 001
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,64 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+//
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+//
+[assembly: AssemblyTitle("Apache Lucene.Net (Similarity)")]
+[assembly: AssemblyDescription("The Apache Software Foundation Lucene.Net a full-text search engine library")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Similarity.Net")]
+[assembly: AssemblyCopyright("Copyright 2007 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2007 The Apache Software Foundation")]
+[assembly: AssemblyDefaultAlias("Lucene.Net.Similarity")]
+[assembly: AssemblyCulture("")]
+
+[assembly: AssemblyInformationalVersionAttribute("2.0")]
+
+
+// Version information for an assembly consists of the following four values:
+//
+// Major Version
+// Minor Version
+// Revision
+// Build Number
+//
+// You can specify all the values (as is done below), or you can default the
+// Revision and Build Numbers by using '*' in their place, e.g. "2.0.*":
+
+[assembly: AssemblyVersion("2.0.0.1")]
+
+//
+// In order to sign your assembly you must specify a key to use. Refer to the
+// Microsoft .NET Framework documentation for more information on assembly signing.
+//
+// Use the attributes below to control which key is used for signing.
+//
+// Notes:
+// (*) If no key is specified, the assembly is not signed.
+// (*) KeyName refers to a key that has been installed in the Crypto Service
+// Provider (CSP) on your machine. KeyFile refers to a file which contains
+// a key.
+// (*) If the KeyFile and the KeyName values are both specified, the
+// following processing occurs:
+// (1) If the KeyName can be found in the CSP, that key is used.
+// (2) If the KeyName does not exist and the KeyFile does exist, the key
+// in the KeyFile is installed into the CSP and used.
+// (*) In order to create a KeyFile, you can use the sn.exe (Strong Name) utility.
+// When specifying the KeyFile, the location of the KeyFile should be
+// relative to the project output directory which is
+// %Project Directory%\obj\<configuration>. For example, if your KeyFile is
+// located in the project directory, you would specify the AssemblyKeyFile
+// attribute as [assembly: AssemblyKeyFile("..\..\mykey.snk")]
+// (*) Delay Signing is an advanced option - see the Microsoft .NET Framework
+// documentation for more information on this.
+//
+
+[assembly: AssemblyDelaySign(false)]
+[assembly: AssemblyKeyFile("")]
+[assembly: AssemblyKeyName("")]
+
+
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Build.xml?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml Sat Mar 10 11:26:32 2007
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+
+<project name="similarity" default="default">
+
+ <description>
+ Similarity - MoreLikeThis
+ </description>
+
+ <import file="../contrib-build.xml"/>
+</project>
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/HISTORY.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,6 @@
+Similarity.Net History
+----------------------
+
+
+11Mar07:
+ - Release: Similarity.Net 2.0.0 build 001
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/README.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,3 @@
+Document similarity measures.
+The most significant contribution here is MoreLikeThis,
+in /src/java/org/apache/lucene/search/similar.
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,975 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using PriorityQueue = Lucene.Net.Util.PriorityQueue;
+using IndexReader = Lucene.Net.Index.IndexReader;
+using Term = Lucene.Net.Index.Term;
+using TermFreqVector = Lucene.Net.Index.TermFreqVector;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using Hits = Lucene.Net.Search.Hits;
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
+using Document = Lucene.Net.Documents.Document;
+
+namespace Similarity.Net
+{
+
+
+ /// <summary> Generate "more like this" similarity queries.
+ /// Based on this mail:
+ /// <code><pre>
+ /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
+ /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+ /// is usually fast enough. But looking up the DocFreq() of every term in the document is
+ /// probably too slow.
+ ///
+ /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
+ /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
+ /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+ /// reduce the number of terms under consideration. Another heuristic is that terms with a
+ /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
+ /// number of characters, not selecting anything less than, e.g., six or seven characters.
+ /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+ /// that do a pretty good job of characterizing a document.
+ ///
+ /// It all depends on what you're trying to do. If you're trying to eek out that last percent
+ /// of precision and recall regardless of computational difficulty so that you can win a TREC
+ /// competition, then the techniques I mention above are useless. But if you're trying to
+ /// provide a "more like this" button on a search results page that does a decent job and has
+ /// good performance, such techniques might be useful.
+ ///
+ /// An efficient, effective "more-like-this" query generator would be a great contribution, if
+ /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's
+ /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+ /// above. The frequency and length thresholds could be parameters, etc.
+ ///
+ /// Doug
+ /// </pre></code>
+ ///
+ ///
+ /// <p>
+ /// <h3>Initial Usage</h3>
+ ///
+ /// This class has lots of options to try to make it efficient and flexible.
+ /// See the body of {@link #main Main()} below in the source for real code, or
+ /// if you want pseudo code, the simplest possible usage is as follows. The bold
+ /// fragment is specific to this class.
+ ///
+ /// <code><pre>
+ ///
+ /// IndexReader ir = ...
+ /// IndexSearcher is = ...
+ /// <b>
+ /// MoreLikeThis mlt = new MoreLikeThis(ir);
+ /// Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
+ /// Query query = mlt.Like( target);
+ /// </b>
+ /// Hits hits = is.Search(query);
+ /// <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+ /// you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
+ ///
+ /// </pre></code>
+ ///
+ /// Thus you:
+ /// <ol>
+ /// <li> do your normal, Lucene setup for searching,
+ /// <li> create a MoreLikeThis,
+ /// <li> get the text of the doc you want to find similarities to
+ /// <li> then call one of the Like() calls to generate a similarity query
+ /// <li> call the searcher to find the similar docs
+ /// </ol>
+ ///
+ /// <h3>More Advanced Usage</h3>
+ ///
+ /// You may want to use {@link #SetFieldNames SetFieldNames(...)} so you can examine
+ /// multiple fields (e.g. body and title) for similarity.
+ /// <p>
+ ///
+ /// Depending on the size of your index and the size and makeup of your documents you
+ /// may want to call the other set methods to control how the similarity queries are
+ /// generated:
+ /// <ul>
+ /// <li> {@link #SetMinTermFreq SetMinTermFreq(...)}
+ /// <li> {@link #SetMinDocFreq SetMinDocFreq(...)}
+ /// <li> {@link #SetMinWordLen SetMinWordLen(...)}
+ /// <li> {@link #SetMaxWordLen SetMaxWordLen(...)}
+ /// <li> {@link #SetMaxQueryTerms SetMaxQueryTerms(...)}
+ /// <li> {@link #SetMaxNumTokensParsed SetMaxNumTokensParsed(...)}
+ /// <li> {@link #SetStopWords SetStopWords(...)}
+ /// </ul>
+ ///
+ /// <hr>
+ /// <pre>
+ /// Changes: Mark Harwood 29/02/04
+ /// Some bugfixing, some refactoring, some optimisation.
+ /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+ /// - bugfix: No significant terms being created for fields with a termvector - because
+ /// was only counting one occurrence per term/field pair in calculations (i.e. not including frequency info from TermVector)
+ /// - refactor: moved common code into isNoiseWord()
+ /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+ /// </pre>
+ ///
+ /// </summary>
+ /// <author> David Spencer
+ /// </author>
+ /// <author> Bruce Ritchie
+ /// </author>
+ /// <author> Mark Harwood
+ /// </author>
+ public sealed class MoreLikeThis
+ {
+
+ /// <summary> Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
+ /// <seealso cref="GetMaxNumTokensParsed"/>
+ public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+
+ /// <summary> Default analyzer to parse source doc with.</summary>
+ /// <seealso cref="GetAnalyzer"/>
+ public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
+
+ /// <summary> Ignore terms with less than this frequency in the source doc.</summary>
+ /// <seealso cref="GetMinTermFreq"/>
+ /// <seealso cref="SetMinTermFreq"/>
+ public const int DEFAULT_MIN_TERM_FREQ = 2;
+
+ /// <summary> Ignore words which do not occur in at least this many docs.
+ /// NOTE: the identifier is misspelled ("DEFALT"); it is public, so it is left as-is
+ /// to avoid breaking callers.
+ /// </summary>
+ /// <seealso cref="GetMinDocFreq"/>
+ /// <seealso cref="SetMinDocFreq"/>
+ public const int DEFALT_MIN_DOC_FREQ = 5;
+
+ /// <summary> Boost terms in query based on score.</summary>
+ /// <seealso cref="IsBoost"/>
+ /// <seealso cref="SetBoost"/>
+ public const bool DEFAULT_BOOST = false;
+
+ /// <summary> Default field names: the single field "contents".
+ /// Passing null to <see cref="SetFieldNames"/> instead requests that the field names be
+ /// looked up at runtime from the provided reader.
+ /// </summary>
+ public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[]{"contents"};
+
+ /// <summary> Ignore words less than this length or if 0 then this has no effect.</summary>
+ /// <seealso cref="GetMinWordLen"/>
+ /// <seealso cref="SetMinWordLen"/>
+ public const int DEFAULT_MIN_WORD_LENGTH = 0;
+
+ /// <summary> Ignore words greater than this length or if 0 then this has no effect.</summary>
+ /// <seealso cref="GetMaxWordLen"/>
+ /// <seealso cref="SetMaxWordLen"/>
+ public const int DEFAULT_MAX_WORD_LENGTH = 0;
+
+ /// <summary> Default set of stopwords.
+ /// Null means stop words are allowed.
+ ///
+ /// </summary>
+ /// <seealso cref="SetStopWords"/>
+ /// <seealso cref="GetStopWords"/>
+ public static readonly System.Collections.Hashtable DEFAULT_STOP_WORDS = null;
+
+ /// <summary> Current set of stop words.</summary>
+ private System.Collections.Hashtable stopWords = DEFAULT_STOP_WORDS;
+
+ /// <summary> Return a Query with no more than this many terms.
+ ///
+ /// </summary>
+ /// <seealso cref="BooleanQuery"/>
+ /// <seealso cref="GetMaxQueryTerms"/>
+ /// <seealso cref="SetMaxQueryTerms"/>
+ public const int DEFAULT_MAX_QUERY_TERMS = 25;
+
+ /// <summary> Analyzer that will be used to parse the doc.</summary>
+ private Analyzer analyzer = DEFAULT_ANALYZER;
+
+ /// <summary> Ignore words less frequent than this.</summary>
+ private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+ /// <summary> Ignore words which do not occur in at least this many docs.</summary>
+ private int minDocFreq = DEFALT_MIN_DOC_FREQ;
+
+ /// <summary> Should we apply a boost to the Query based on the scores?</summary>
+ private bool boost = DEFAULT_BOOST;
+
+ /// <summary> Names of the fields we'll analyze; null means "resolve from the reader".</summary>
+ private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+ /// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary>
+ private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+
+
+ /// <summary> Ignore words if less than this len.</summary>
+ private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+ /// <summary> Ignore words if greater than this len.</summary>
+ private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+ /// <summary> Don't return a query longer than this.</summary>
+ private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+ /// <summary> For idf() calculations.</summary>
+ private Lucene.Net.Search.Similarity similarity = new DefaultSimilarity();
+
+ /// <summary> IndexReader to use</summary>
+ private IndexReader ir;
+
+ /// <summary> Creates a query generator bound to the given index reader.
+ /// The reader is kept for the lifetime of this instance; it is consulted for document
+ /// frequencies, the total document count and (when no field names are set) the list
+ /// of indexed fields. All other settings start at their DEFAULT_* values.
+ /// </summary>
+ /// <param name="ir">the reader over the index that similar documents are searched in.
+ /// </param>
+ public MoreLikeThis(IndexReader ir)
+ {
+ this.ir = ir;
+ }
+
+ /// <summary> Gets the analyzer used to tokenize source text when a query is generated.
+ /// Unless replaced through <see cref="SetAnalyzer"/>, this is <see cref="DEFAULT_ANALYZER"/>.
+ /// </summary>
+ /// <returns> the analyzer currently configured on this instance.
+ /// </returns>
+ /// <seealso cref="DEFAULT_ANALYZER"/>
+ public Analyzer GetAnalyzer()
+ {
+     return this.analyzer;
+ }
+
+ /// <summary> Sets the analyzer used to tokenize source text. According to the original
+ /// documentation, an analyzer is not required when generating a query with
+ /// <see cref="Like(int)"/>; all other 'Like' overloads require one.
+ /// </summary>
+ /// <param name="analyzer">the analyzer to use to tokenize text.
+ /// </param>
+ /// <seealso cref="GetAnalyzer"/>
+ public void SetAnalyzer(Analyzer analyzer)
+ {
+ this.analyzer = analyzer;
+ }
+
+ /// <summary> Gets the minimum number of times a term must occur in the source doc to be
+ /// considered; terms occurring less often are ignored. The default is
+ /// <see cref="DEFAULT_MIN_TERM_FREQ"/>.
+ /// </summary>
+ /// <returns> the frequency below which terms will be ignored in the source doc.
+ /// </returns>
+ /// <seealso cref="SetMinTermFreq"/>
+ public int GetMinTermFreq()
+ {
+     return this.minTermFreq;
+ }
+
+ /// <summary> Sets the frequency below which terms will be ignored in the source doc.
+ /// A value of 0 or less turns this filter off.
+ /// </summary>
+ /// <param name="minTermFreq">the frequency below which terms will be ignored in the source doc.
+ /// </param>
+ /// <seealso cref="GetMinTermFreq"/>
+ public void SetMinTermFreq(int minTermFreq)
+ {
+ this.minTermFreq = minTermFreq;
+ }
+
+ /// <summary> Gets the minimum number of documents a word must occur in to be considered;
+ /// words found in fewer documents are ignored. The default is
+ /// <see cref="DEFALT_MIN_DOC_FREQ"/>.
+ /// </summary>
+ /// <returns> the document frequency below which words will be ignored.
+ /// </returns>
+ /// <seealso cref="SetMinDocFreq"/>
+ public int GetMinDocFreq()
+ {
+     return this.minDocFreq;
+ }
+
+ /// <summary> Sets the minimum number of documents a word must occur in to be considered;
+ /// words found in fewer documents are ignored. A value of 0 or less turns this
+ /// threshold off (words that occur in no document at all are still skipped).
+ /// </summary>
+ /// <param name="minDocFreq">the document frequency below which words will be ignored.
+ /// </param>
+ /// <seealso cref="GetMinDocFreq"/>
+ public void SetMinDocFreq(int minDocFreq)
+ {
+ this.minDocFreq = minDocFreq;
+ }
+
+ /// <summary> Tells whether terms in the generated query are boosted according to their
+ /// "score". The default is <see cref="DEFAULT_BOOST"/>.
+ /// </summary>
+ /// <returns> true if query terms are boosted by score, false otherwise.
+ /// </returns>
+ /// <seealso cref="SetBoost"/>
+ public bool IsBoost()
+ {
+     return this.boost;
+ }
+
+ /// <summary> Sets whether to boost terms in query based on "score" or not.
+ /// When enabled, each term's boost is its score divided by the score of the
+ /// first (best) term taken from the queue.
+ /// </summary>
+ /// <param name="boost">true to boost terms in query based on "score", false otherwise.
+ /// </param>
+ /// <seealso cref="IsBoost"/>
+ public void SetBoost(bool boost)
+ {
+ this.boost = boost;
+ }
+
+ /// <summary> Gets the names of the fields used when generating the 'More Like This' query.
+ /// The default is <see cref="DEFAULT_FIELD_NAMES"/>. The internal array itself is
+ /// returned, not a copy.
+ /// </summary>
+ /// <returns> the configured field names; may be null if they were reset with
+ /// <see cref="SetFieldNames"/> and have not been resolved from the index yet.
+ /// </returns>
+ public System.String[] GetFieldNames()
+ {
+     return this.fieldNames;
+ }
+
+ /// <summary> Sets the field names that will be used when generating the 'More Like This' query.
+ /// Set this to null for the field names to be determined at runtime from the IndexReader
+ /// provided in the constructor. The array is stored by reference, not copied.
+ /// </summary>
+ /// <param name="fieldNames">the field names that will be used when generating the 'More Like This'
+ /// query, or null to use the indexed fields of the reader.
+ /// </param>
+ /// <seealso cref="GetFieldNames"/>
+ public void SetFieldNames(System.String[] fieldNames)
+ {
+ this.fieldNames = fieldNames;
+ }
+
+ /// <summary> Gets the minimum word length; shorter words are ignored. A value of 0 means
+ /// there is no minimum. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
+ /// </summary>
+ /// <returns> the minimum word length below which words will be ignored.
+ /// </returns>
+ /// <seealso cref="SetMinWordLen"/>
+ public int GetMinWordLen()
+ {
+     return this.minWordLen;
+ }
+
+ /// <summary> Sets the minimum word length below which words will be ignored.
+ /// Set this to 0 for no minimum word length.
+ /// </summary>
+ /// <param name="minWordLen">the minimum word length below which words will be ignored.
+ /// </param>
+ /// <seealso cref="GetMinWordLen"/>
+ public void SetMinWordLen(int minWordLen)
+ {
+ this.minWordLen = minWordLen;
+ }
+
+ /// <summary> Gets the maximum word length; longer words are ignored. A value of 0 means
+ /// there is no maximum. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
+ /// </summary>
+ /// <returns> the maximum word length above which words will be ignored.
+ /// </returns>
+ /// <seealso cref="SetMaxWordLen"/>
+ public int GetMaxWordLen()
+ {
+     return this.maxWordLen;
+ }
+
+ /// <summary> Sets the maximum word length above which words will be ignored.
+ /// Set this to 0 for no maximum word length.
+ /// </summary>
+ /// <param name="maxWordLen">the maximum word length above which words will be ignored.
+ /// </param>
+ /// <seealso cref="GetMaxWordLen"/>
+ public void SetMaxWordLen(int maxWordLen)
+ {
+ this.maxWordLen = maxWordLen;
+ }
+
+ /// <summary> Set the set of stopwords.
+ /// Any word in this set is considered "uninteresting" and ignored.
+ /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+ /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+ /// The table is stored by reference, not copied.
+ /// </summary>
+ /// <param name="stopWords">set of stopwords, if null it means to allow stop words.
+ /// The original Java documentation points at StopFilter.makeStopSet() as a way to
+ /// build such a set; confirm the matching Lucene.Net helper before relying on it.
+ /// </param>
+ /// <seealso cref="GetStopWords"/>
+ public void SetStopWords(System.Collections.Hashtable stopWords)
+ {
+ this.stopWords = stopWords;
+ }
+
+ /// <summary> Gets the stop word set currently in use.</summary>
+ /// <returns> the configured stop words, or null when stop words are allowed
+ /// (the default, see <see cref="DEFAULT_STOP_WORDS"/>).
+ /// </returns>
+ /// <seealso cref="SetStopWords"/>
+ public System.Collections.Hashtable GetStopWords()
+ {
+     return this.stopWords;
+ }
+
+
+ /// <summary> Gets the upper bound on the number of terms placed in a generated query.
+ /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
+ /// </summary>
+ /// <returns> the maximum number of query terms that will be included in any generated query.
+ /// </returns>
+ /// <seealso cref="SetMaxQueryTerms"/>
+ public int GetMaxQueryTerms()
+ {
+     return this.maxQueryTerms;
+ }
+
+ /// <summary> Sets the maximum number of query terms that will be included in any generated query.
+ /// A value of 0 or less removes this limit (the query is then bounded only by
+ /// BooleanQuery's own clause limit).
+ /// </summary>
+ /// <param name="maxQueryTerms">the maximum number of query terms that will be included in any
+ /// generated query.
+ /// </param>
+ /// <seealso cref="GetMaxQueryTerms"/>
+ public void SetMaxQueryTerms(int maxQueryTerms)
+ {
+ this.maxQueryTerms = maxQueryTerms;
+ }
+
+ /// <summary> Gets the token limit applied to example doc fields that are not stored with
+ /// TermVector support.
+ /// </summary>
+ /// <returns> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+ /// </returns>
+ /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>
+ public int GetMaxNumTokensParsed()
+ {
+     return this.maxNumTokensParsed;
+ }
+
+ /// <summary> Sets the token limit applied to example doc fields that are not stored with
+ /// TermVector support.
+ /// </summary>
+ /// <param name="i">The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+ /// </param>
+ /// <seealso cref="GetMaxNumTokensParsed"/>
+ public void SetMaxNumTokensParsed(int i)
+ {
+     this.maxNumTokensParsed = i;
+ }
+
+
+
+
+ /// <summary> Builds a query that matches documents similar to the document with the given
+ /// Lucene document ID. If no field names are configured, the indexed fields of the
+ /// reader are looked up first and remembered on this instance.
+ /// </summary>
+ /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This' query for.
+ /// </param>
+ /// <returns> a query that will return docs like the passed lucene document ID.
+ /// </returns>
+ public Query Like(int docNum)
+ {
+     if (fieldNames == null)
+     {
+         // no explicit field list: fall back to every indexed field the reader knows
+         System.Collections.ICollection indexed = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+         fieldNames = new System.String[indexed.Count];
+         int slot = 0;
+         foreach (System.Object name in indexed)
+         {
+             fieldNames[slot++] = (System.String) name;
+         }
+     }
+
+     PriorityQueue terms = RetrieveTerms(docNum);
+     return CreateQuery(terms);
+ }
+
+ /// <summary> Return a query that will return docs like the passed file.
+ /// The file is read with the system default encoding and is closed again before
+ /// this method returns. If no field names are configured, the indexed fields of the
+ /// reader are looked up first and remembered on this instance.
+ /// </summary>
+ /// <param name="f">the file whose text is used as the example document.
+ /// </param>
+ /// <returns> a query that will return docs like the passed file.
+ /// </returns>
+ public Query Like(System.IO.FileInfo f)
+ {
+     if (fieldNames == null)
+     {
+         // gather list of valid fields from lucene
+         System.Collections.ICollection fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+         System.Collections.IEnumerator e = fields.GetEnumerator();
+         fieldNames = new System.String[fields.Count];
+         int index = 0;
+         while (e.MoveNext())
+             fieldNames[index++] = (System.String) e.Current;
+     }
+
+     // The reader is opened here, so it is also closed here; the returned query only
+     // holds terms and no longer needs the file once it has been built.
+     using (System.IO.StreamReader reader = new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default))
+     {
+         return Like(reader);
+     }
+ }
+
+ /// <summary> Return a query that will return docs like the passed URL.
+ /// The content behind the URL is fetched and decoded with the system default
+ /// encoding; the response and its stream are released before this method returns.
+ /// Any URI scheme supported by System.Net.WebRequest is accepted (the request is no
+ /// longer cast to HttpWebRequest, which rejected non-HTTP URIs with an
+ /// InvalidCastException).
+ /// </summary>
+ /// <param name="u">the location of the example document.
+ /// </param>
+ /// <returns> a query that will return docs like the passed URL.
+ /// </returns>
+ public Query Like(System.Uri u)
+ {
+     // Both the response (network connection) and the reader are owned by this method,
+     // so dispose them as soon as the query has been built.
+     using (System.Net.WebResponse response = System.Net.WebRequest.Create(u).GetResponse())
+     using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
+     {
+         return Like(reader);
+     }
+ }
+
+ /// <summary> Builds a query that matches documents similar to the text read from the given
+ /// stream. The bytes are decoded with the system default encoding. The stream belongs
+ /// to the caller and is not closed here.
+ /// </summary>
+ /// <param name="is_Renamed">the stream supplying the text of the example document.
+ /// </param>
+ /// <returns> a query that will return docs like the passed stream.
+ /// </returns>
+ public Query Like(System.IO.Stream is_Renamed)
+ {
+     System.IO.StreamReader reader = new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default);
+     return Like(reader);
+ }
+
+ /// <summary> Return a query that will return docs like the passed Reader.
+ /// If no field names are configured (see <see cref="SetFieldNames"/>), the indexed
+ /// fields of the reader are looked up first and remembered on this instance, exactly
+ /// as <see cref="Like(int)"/> does. Without this the URL, stream and reader overloads
+ /// would run with a null field list. The reader belongs to the caller and is not
+ /// closed here.
+ /// </summary>
+ /// <param name="r">the reader supplying the text of the example document.
+ /// </param>
+ /// <returns> a query that will return docs like the passed Reader.
+ /// </returns>
+ public Query Like(System.IO.StreamReader r)
+ {
+     if (fieldNames == null)
+     {
+         // gather list of valid fields from lucene
+         System.Collections.ICollection fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+         fieldNames = new System.String[fields.Count];
+         int index = 0;
+         foreach (System.Object name in fields)
+         {
+             fieldNames[index++] = (System.String) name;
+         }
+     }
+
+     return CreateQuery(RetrieveTerms(r));
+ }
+
+ /// <summary> Turns a queue of scored terms into the final 'More Like This' query.
+ /// Entries are popped one at a time; each is an object array holding the word at
+ /// index 0, its field at index 1 and its score at index 2. Every entry becomes an
+ /// optional (SHOULD) term clause. Popping stops when the queue is empty, when the
+ /// configured maximum number of query terms is reached, or when BooleanQuery refuses
+ /// further clauses.
+ /// </summary>
+ /// <param name="q">the queue of candidate terms.
+ /// </param>
+ /// <returns> the assembled boolean query (possibly without clauses).
+ /// </returns>
+ private Query CreateQuery(PriorityQueue q)
+ {
+     BooleanQuery result = new BooleanQuery();
+     int added = 0;
+     float topScore = 0;
+
+     for (System.Object entry = q.Pop(); entry != null; entry = q.Pop())
+     {
+         System.Object[] parts = (System.Object[]) entry;
+         System.String word = (System.String) parts[0];
+         System.String field = (System.String) parts[1];
+         TermQuery clause = new TermQuery(new Term(field, word));
+
+         if (boost)
+         {
+             float score = (System.Single) parts[2];
+             if (added == 0)
+             {
+                 // the first popped entry provides the reference score for all boosts
+                 topScore = score;
+             }
+             clause.SetBoost(score / topScore);
+         }
+
+         try
+         {
+             result.Add(clause, BooleanClause.Occur.SHOULD);
+         }
+         catch (BooleanQuery.TooManyClauses)
+         {
+             // the query is full: keep what has been collected so far
+             break;
+         }
+
+         added++;
+         if (maxQueryTerms > 0 && added >= maxQueryTerms)
+         {
+             break;
+         }
+     }
+
+     return result;
+ }
+
+ /// <summary> Scores every word of a word->tf map and collects the survivors in a queue.
+ /// A word is dropped when its term frequency is below the configured minimum, when
+ /// its best document frequency over all configured fields is below the configured
+ /// minimum, or when no configured field contains it at all. The score of a word is
+ /// its term frequency multiplied by the idf of its best document frequency.
+ /// </summary>
+ /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
+ /// </param>
+ /// <returns> a queue of object arrays laid out as { word, field, score, idf, docFreq, tf }.
+ /// </returns>
+ private PriorityQueue CreateQueue(System.Collections.IDictionary words)
+ {
+     // all words of the doc and their frequencies have been collected by the caller
+     int totalDocs = ir.NumDocs();
+     FreqQ queue = new FreqQ(words.Count); // orders words by score
+
+     foreach (System.Object key in words.Keys)
+     {
+         System.String word = (System.String) key;
+
+         // term frequency in the source doc
+         int tf = ((Int) words[word]).x;
+         if (minTermFreq > 0 && tf < minTermFreq)
+         {
+             continue; // not frequent enough in the source
+         }
+
+         // pick the field in which the word has the largest document frequency
+         System.String bestField = fieldNames[0];
+         int bestDocFreq = 0;
+         foreach (System.String candidate in fieldNames)
+         {
+             int freq = ir.DocFreq(new Term(candidate, word));
+             if (freq > bestDocFreq)
+             {
+                 bestField = candidate;
+                 bestDocFreq = freq;
+             }
+         }
+
+         if (minDocFreq > 0 && bestDocFreq < minDocFreq)
+         {
+             continue; // not present in enough docs
+         }
+
+         if (bestDocFreq == 0)
+         {
+             continue; // index update problem?
+         }
+
+         float idf = similarity.Idf(bestDocFreq, totalDocs);
+         float score = tf * idf;
+
+         // only the first 3 entries are really needed, the rest helps troubleshooting
+         System.Object[] entry = new System.Object[6];
+         entry[0] = word;
+         entry[1] = bestField;
+         entry[2] = score;
+         entry[3] = idf;
+         entry[4] = bestDocFreq;
+         entry[5] = tf;
+         queue.Insert(entry);
+     }
+
+     return queue;
+ }
+
+ /// <summary> Describe the parameters that control how the "more like this" query is formed.
+ /// Each parameter is written on its own tab-indented line. The field names are
+ /// listed comma-separated inside a pair of double quotes; when no field names are
+ /// configured (null) the quotes are left empty instead of failing.
+ /// </summary>
+ /// <returns> a multi-line, human readable summary of the current settings.
+ /// </returns>
+ public System.String DescribeParams()
+ {
+     System.Text.StringBuilder sb = new System.Text.StringBuilder();
+     sb.Append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
+     sb.Append("\t" + "minWordLen : " + minWordLen + "\n");
+     sb.Append("\t" + "maxWordLen : " + maxWordLen + "\n");
+     sb.Append("\t" + "fieldNames : \"");
+     if (fieldNames != null)
+     {
+         // null is a legal setting (field names resolved later from the reader)
+         sb.Append(System.String.Join(", ", fieldNames));
+     }
+     // close the quote opened above; it was previously left unbalanced
+     sb.Append("\"\n");
+     sb.Append("\t" + "boost : " + boost + "\n");
+     sb.Append("\t" + "minTermFreq : " + minTermFreq + "\n");
+     sb.Append("\t" + "minDocFreq : " + minDocFreq + "\n");
+     return sb.ToString();
+ }
+
+ /// <summary> Test driver.
+ /// Pass in "-i INDEX" and then either "-fn FILE" (the older spelling "-f FILE"
+ /// is still accepted) or "-url URL". Prints the generated query and up to 25 hits.
+ /// </summary>
+ /// <param name="a">the command line arguments
+ /// </param>
+ [STAThread]
+ public static void Main(System.String[] a)
+ {
+     System.String indexName = "localhost_index";
+     System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
+     System.Uri url = null;
+     for (int i = 0; i < a.Length; i++)
+     {
+         System.String opt = a[i];
+         bool isIndex = opt.Equals("-i");
+         bool isFile = opt.Equals("-f") || opt.Equals("-fn");
+         bool isUrl = opt.Equals("-url");
+         if (!isIndex && !isFile && !isUrl)
+         {
+             continue; // unknown arguments are ignored, as before
+         }
+
+         // every option needs a value; report a missing one instead of
+         // failing with an IndexOutOfRangeException
+         if (i + 1 >= a.Length)
+         {
+             System.Console.Error.WriteLine("Missing value for option " + opt);
+             return;
+         }
+
+         System.String optValue = a[++i];
+         if (isIndex)
+         {
+             indexName = optValue;
+         }
+         else if (isFile)
+         {
+             fn = optValue;
+         }
+         else
+         {
+             url = new System.Uri(optValue);
+         }
+     }
+
+     System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
+     o.AutoFlush = true;
+
+     IndexReader r = IndexReader.Open(indexName);
+     IndexSearcher searcher = null;
+     try
+     {
+         o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");
+
+         MoreLikeThis mlt = new MoreLikeThis(r);
+
+         o.WriteLine("Query generation parameters:");
+         o.WriteLine(mlt.DescribeParams());
+         o.WriteLine();
+
+         // a URL, when given, takes precedence over the file
+         Query query = null;
+         if (url != null)
+         {
+             o.WriteLine("Parsing URL: " + url);
+             query = mlt.Like(url);
+         }
+         else if (fn != null)
+         {
+             o.WriteLine("Parsing file: " + fn);
+             query = mlt.Like(new System.IO.FileInfo(fn));
+         }
+
+         o.WriteLine("q: " + query);
+         o.WriteLine();
+         searcher = new IndexSearcher(indexName);
+
+         Hits hits = searcher.Search(query);
+         int len = hits.Length();
+         o.WriteLine("found: " + len + " documents matching");
+         o.WriteLine();
+         for (int i = 0; i < System.Math.Min(25, len); i++)
+         {
+             Document d = hits.Doc(i);
+             System.String summary = d.Get("summary");
+             o.WriteLine("score : " + hits.Score(i));
+             o.WriteLine("url : " + d.Get("url"));
+             o.WriteLine("\ttitle : " + d.Get("title"));
+             if (summary != null)
+             {
+                 o.WriteLine("\tsummary: " + summary);
+             }
+             o.WriteLine();
+         }
+     }
+     finally
+     {
+         // release the index files even when parsing or searching fails
+         if (searcher != null)
+         {
+             searcher.Close();
+         }
+         r.Close();
+     }
+ }
+
+ /// <summary> Find words for a more-like-this query former.
+ /// For each configured field the stored term vector is used when there is one;
+ /// otherwise the stored values of the field are re-analyzed.
+ /// </summary>
+ /// <param name="docNum">the id of the lucene document from which to find terms
+ /// </param>
+ /// <returns> the queue built by CreateQueue from the collected term frequencies
+ /// </returns>
+ private PriorityQueue RetrieveTerms(int docNum)
+ {
+     System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
+     Document storedDoc = null; // loaded at most once, and only if some field lacks a term vector
+     for (int i = 0; i < fieldNames.Length; i++)
+     {
+         System.String fieldName = fieldNames[i];
+         TermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);
+         if (vector != null)
+         {
+             AddTermFrequencies(termFreqMap, vector);
+             continue;
+         }
+
+         // field does not store term vector info: fall back to the stored text
+         if (storedDoc == null)
+         {
+             storedDoc = ir.Document(docNum);
+         }
+         System.String[] text = storedDoc.GetValues(fieldName);
+         if (text == null)
+         {
+             continue;
+         }
+
+         for (int j = 0; j < text.Length; j++)
+         {
+             // StreamReader(String) would open a FILE named after the field text,
+             // so the text itself is wrapped in an in-memory stream instead.
+             // A BOM-less encoding with detection off keeps the text unchanged.
+             System.Text.Encoding utf8 = new System.Text.UTF8Encoding(false);
+             System.IO.StreamReader reader = new System.IO.StreamReader(new System.IO.MemoryStream(utf8.GetBytes(text[j])), utf8, false);
+             try
+             {
+                 AddTermFrequencies(reader, termFreqMap, fieldName);
+             }
+             finally
+             {
+                 reader.Close();
+             }
+         }
+     }
+
+     return CreateQueue(termFreqMap);
+ }
+
+ /// <summary> Merges the terms of a term vector, with their frequencies, into termFreqMap.
+ /// Noise words are skipped; a term already in the map has its count increased.
+ /// </summary>
+ /// <param name="termFreqMap">a Map of terms and their frequencies
+ /// </param>
+ /// <param name="vector">List of terms and their frequencies for a doc/field
+ /// </param>
+ private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
+ {
+     System.String[] terms = vector.GetTerms();
+     int[] freqs = vector.GetTermFrequencies();
+     for (int idx = 0; idx < terms.Length; idx++)
+     {
+         System.String term = terms[idx];
+         if (!IsNoiseWord(term))
+         {
+             Int counter = (Int) termFreqMap[term];
+             if (counter != null)
+             {
+                 // seen before: accumulate
+                 counter.x += freqs[idx];
+             }
+             else
+             {
+                 // first occurrence: register the counter, then set its frequency
+                 counter = new Int();
+                 termFreqMap[term] = counter;
+                 counter.x = freqs[idx];
+             }
+         }
+     }
+ }
+ /// <summary> Tokenizes the text from the reader and counts each non-noise word in termFreqMap.
+ /// Stops once more than <c>maxNumTokensParsed</c> tokens have been read.
+ /// </summary>
+ /// <param name="r">a source of text to be tokenized
+ /// </param>
+ /// <param name="termFreqMap">a Map of terms and their frequencies
+ /// </param>
+ /// <param name="fieldName">Used by analyzer for any special per-field analysis
+ /// </param>
+ private void AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
+ {
+     TokenStream stream = analyzer.TokenStream(fieldName, r);
+     int seen = 0;
+     for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
+     {
+         System.String word = tok.TermText();
+         if (++seen > maxNumTokensParsed)
+         {
+             break; // token budget exhausted
+         }
+         if (IsNoiseWord(word))
+         {
+             continue;
+         }
+
+         Int counter = (Int) termFreqMap[word];
+         if (counter == null)
+         {
+             termFreqMap[word] = new Int(); // a new Int starts at 1
+         }
+         else
+         {
+             counter.x++;
+         }
+     }
+ }
+
+
+ /// <summary> Decides whether a term should be ignored in "more like" comparisons.
+ /// A term is noise when it is shorter than <c>minWordLen</c> or longer than
+ /// <c>maxWordLen</c> (each limit applies only when it is positive), or when it
+ /// is contained in <c>stopWords</c>.
+ /// </summary>
+ /// <param name="term">The word being considered
+ /// </param>
+ /// <returns> true if should be ignored, false if should be used in further analysis
+ /// </returns>
+ private bool IsNoiseWord(System.String term)
+ {
+     int length = term.Length;
+     bool tooShort = minWordLen > 0 && length < minWordLen;
+     bool tooLong = maxWordLen > 0 && length > maxWordLen;
+     // the stop word lookup only runs when the length checks did not already decide
+     return tooShort || tooLong || (stopWords != null && stopWords.Contains(term));
+ }
+
+
+ /// <summary> Find words for a more-like-this query former.
+ /// The result is a priority queue of arrays, one per word of the document that
+ /// passes the frequency filters applied by CreateQueue.
+ /// Each array has 6 elements:
+ /// <list type="number">
+ /// <item><description>The word (String)</description></item>
+ /// <item><description>The top field that this word comes from (String)</description></item>
+ /// <item><description>The score for this word (Single)</description></item>
+ /// <item><description>The IDF value (Single)</description></item>
+ /// <item><description>The frequency of this word in the index (Int32)</description></item>
+ /// <item><description>The frequency of this word in the source document (Int32)</description></item>
+ /// </list>
+ /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+ /// This method is exposed so that you can identify the "interesting words" in a document.
+ /// For an easier method to call see <see cref="RetrieveInterestingTerms"/>.
+ /// </summary>
+ /// <param name="r">the reader that has the content of the document
+ /// </param>
+ /// <returns> the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
+ /// </returns>
+ /// <seealso cref="RetrieveInterestingTerms">
+ /// </seealso>
+ public PriorityQueue RetrieveTerms(System.IO.StreamReader r)
+ {
+     System.Collections.IDictionary termFreqs = new System.Collections.Hashtable();
+     foreach (System.String fieldName in fieldNames)
+     {
+         // NOTE(review): the same reader instance is handed over for every field;
+         // presumably only the first field sees any text - confirm.
+         AddTermFrequencies(r, termFreqs, fieldName);
+     }
+     return CreateQueue(termFreqs);
+ }
+
+ /// <summary> Convenience routine to make it easy to return the most interesting words in a document.
+ /// More advanced users will call <see cref="RetrieveTerms(System.IO.StreamReader)"/> directly.
+ /// At most <c>maxQueryTerms</c> words are returned; a limit of zero or less yields an empty array.
+ /// </summary>
+ /// <param name="r">the source document
+ /// </param>
+ /// <returns> the most interesting words in the document, best first
+ /// </returns>
+ /// <seealso cref="RetrieveTerms(System.IO.StreamReader)">
+ /// </seealso>
+ public System.String[] RetrieveInterestingTerms(System.IO.StreamReader r)
+ {
+     PriorityQueue pq = RetrieveTerms(r);
+
+     // RetrieveTerms returns all words, but the caller only wants the top ones.
+     // Clamp the initial capacity: a negative limit made the ArrayList constructor
+     // throw, and a huge limit allocated far more than the queue can ever deliver.
+     int capacity = System.Math.Max(0, System.Math.Min(maxQueryTerms, pq.Size()));
+     System.Collections.ArrayList al = new System.Collections.ArrayList(capacity);
+
+     for (int remaining = maxQueryTerms; remaining > 0; remaining--)
+     {
+         System.Object cur = pq.Pop();
+         if (cur == null)
+         {
+             break; // queue exhausted before the limit was reached
+         }
+         System.Object[] ar = (System.Object[]) cur;
+         al.Add(ar[0]); // the 1st entry is the interesting word
+     }
+     return (System.String[]) al.ToArray(typeof(System.String));
+ }
+
+ /// <summary> PriorityQueue of term arrays ordered by the score held in element 2,
+ /// arranged so that the highest scoring entry is popped first.
+ /// </summary>
+ private class FreqQ : PriorityQueue
+ {
+     internal FreqQ(int s)
+     {
+         Initialize(s);
+     }
+
+     /// <summary> Ranks <c>a</c> before <c>b</c> when the score of <c>a</c> is the larger one.</summary>
+     public override bool LessThan(System.Object a, System.Object b)
+     {
+         float scoreA = (float) ((System.Object[]) a)[2];
+         float scoreB = (float) ((System.Object[]) b)[2];
+         return scoreA > scoreB;
+     }
+ }
+
+ /// <summary> Mutable counter used for frequencies, so that a count can be updated
+ /// in place instead of re-boxing an integer. A new counter starts at 1.
+ /// </summary>
+ private class Int
+ {
+     internal int x = 1;
+
+     internal Int()
+     {
+     }
+ }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/Package.html?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html Sat Mar 10 11:26:32 2007
@@ -0,0 +1,5 @@
+<html>
+<body>
+Document similarity query generators.
+</body>
+</html>
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using Term = Lucene.Net.Index.Term;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+
+namespace Similarity.Net
+{
+
+ /// <summary> Simple similarity measures.
+ /// </summary>
+ /// <seealso cref="MoreLikeThis">
+ /// </seealso>
+ public sealed class SimilarityQueries
+ {
+     /// <summary> Not instantiable; the class only offers static helpers.</summary>
+     private SimilarityQueries()
+     {
+     }
+
+     /// <summary> Simple similarity query generator.
+     /// Takes every unique word and forms a boolean query where all words are optional.
+     /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
+     /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
+     /// need to then ignore that.
+     ///
+     /// <para>
+     /// So, if you have a code fragment like this:
+     /// </para>
+     /// <code>
+     /// Query q = SimilarityQueries.FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
+     /// </code>
+     /// <para>
+     /// The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
+     /// </para>
+     /// <para>
+     /// The philosophy behind this method is "two documents are similar if they share lots of words".
+     /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher
+     /// similarity score if they share more uncommon words.
+     /// </para>
+     /// <para>
+     /// This method is fail-safe in that if a long 'body' is passed in and
+     /// <c>BooleanQuery.Add()</c> (used internally) throws
+     /// <c>BooleanQuery.TooManyClauses</c>, the query as it is will be returned.
+     /// </para>
+     /// </summary>
+     /// <param name="body">the body of the document you want to find similar documents to
+     /// </param>
+     /// <param name="a">the analyzer to use to parse the body
+     /// </param>
+     /// <param name="field">the field you want to search on, probably something like "contents" or "body"
+     /// </param>
+     /// <param name="stop">optional set of stop words to ignore; may be null
+     /// </param>
+     /// <returns> a query with all unique words in 'body'
+     /// </returns>
+     /// <exception cref="System.IO.IOException">propagated if the token stream fails while reading
+     /// </exception>
+     public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
+     {
+         TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
+         BooleanQuery query = new BooleanQuery();
+         System.Collections.Hashtable seen = new System.Collections.Hashtable(); // ignore dups
+
+         try
+         {
+             Lucene.Net.Analysis.Token t;
+             while ((t = ts.Next()) != null)
+             {
+                 System.String word = t.TermText();
+                 // ignore optional stop words and words already added
+                 if ((stop != null && stop.Contains(word)) || seen.Contains(word))
+                 {
+                     continue;
+                 }
+                 seen.Add(word, word);
+
+                 // add to query
+                 try
+                 {
+                     query.Add(new TermQuery(new Term(field, word)), BooleanClause.Occur.SHOULD);
+                 }
+                 catch (BooleanQuery.TooManyClauses)
+                 {
+                     // fail-safe, just return what we have, not the end of the world
+                     break;
+                 }
+             }
+         }
+         finally
+         {
+             // the stream (and the reader wrapped above) was previously never
+             // closed; release it on every exit path
+             ts.Close();
+         }
+
+         return query;
+     }
+ }
+}
\ No newline at end of file
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj Sat Mar 10 11:26:32 2007
@@ -0,0 +1,139 @@
+<VisualStudioProject>
+ <CSHARP
+ ProjectType = "Local"
+ ProductVersion = "7.10.3077"
+ SchemaVersion = "2.0"
+ ProjectGuid = "{B9C12C92-199A-426E-B892-3531E2600217}"
+ >
+ <Build>
+ <Settings
+ ApplicationIcon = ""
+ AssemblyKeyContainerName = ""
+ AssemblyName = "SimilarityNet"
+ AssemblyOriginatorKeyFile = ""
+ DefaultClientScript = "JScript"
+ DefaultHTMLPageLayout = "Grid"
+ DefaultTargetSchema = "IE50"
+ DelaySign = "false"
+ OutputType = "Library"
+ PreBuildEvent = ""
+ PostBuildEvent = ""
+ RootNamespace = "SimilarityNet"
+ RunPostBuildEvent = "OnBuildSuccess"
+ StartupObject = ""
+ >
+ <Config
+ Name = "Debug"
+ AllowUnsafeBlocks = "false"
+ BaseAddress = "285212672"
+ CheckForOverflowUnderflow = "false"
+ ConfigurationOverrideFile = ""
+ DefineConstants = ""
+ DocumentationFile = ""
+ DebugSymbols = "true"
+ FileAlignment = "4096"
+ IncrementalBuild = "true"
+ NoStdLib = "false"
+ NoWarn = ""
+ Optimize = "false"
+ OutputPath = ".\"
+ RegisterForComInterop = "false"
+ RemoveIntegerChecks = "false"
+ TreatWarningsAsErrors = "false"
+ WarningLevel = "4"
+ />
+ <Config
+ Name = "Release"
+ AllowUnsafeBlocks = "false"
+ BaseAddress = "285212672"
+ CheckForOverflowUnderflow = "false"
+ ConfigurationOverrideFile = ""
+ DefineConstants = ""
+ DocumentationFile = ""
+ DebugSymbols = "true"
+ FileAlignment = "4096"
+ IncrementalBuild = "true"
+ NoStdLib = "false"
+ NoWarn = ""
+ Optimize = "false"
+ OutputPath = ".\"
+ RegisterForComInterop = "false"
+ RemoveIntegerChecks = "false"
+ TreatWarningsAsErrors = "false"
+ WarningLevel = "4"
+ />
+ </Settings>
+ <References>
+ <Reference
+ Name = "System"
+ AssemblyName = "System"
+ />
+ <Reference
+ Name = "System.Data"
+ AssemblyName = "System.Data"
+ />
+ <Reference
+ Name = "System.Drawing"
+ AssemblyName = "System.Drawing"
+ />
+ <Reference
+ Name = "System.Management"
+ AssemblyName = "System.Management"
+ />
+ <Reference
+ Name = "System.Windows.Forms"
+ AssemblyName = "System.Windows.Forms"
+ />
+ <Reference
+ Name = "System.Design"
+ AssemblyName = "System.Design"
+ />
+ <Reference
+ Name = "System.Xml"
+ AssemblyName = "System.Xml"
+ />
+ <Reference
+ Name = "mscorlib"
+ AssemblyName = "mscorlib"
+ />
+ <Reference
+ Name = "Lucene.Net"
+ AssemblyName = "Lucene.Net"
+ HintPath = "..\..\Lucene.Net-2.0.1-001.src\Lucene.Net\bin\Debug\Lucene.Net.dll"
+ />
+ </References>
+ </Build>
+ <Files>
+ <Include>
+ <File
+ RelPath = "AssemblyInfo.cs"
+ SubType = "Code"
+ BuildAction = "Compile"
+ />
+ <File
+ RelPath = "Build.xml"
+ BuildAction = "Content"
+ />
+ <File
+ RelPath = "README.txt"
+ BuildAction = "Content"
+ />
+ <File
+ RelPath = "Similar\MoreLikeThis.cs"
+ SubType = "Code"
+ BuildAction = "Compile"
+ />
+ <File
+ RelPath = "Similar\Package.html"
+ BuildAction = "Content"
+ />
+ <File
+ RelPath = "Similar\SimilarityQueries.cs"
+ SubType = "Code"
+ BuildAction = "Compile"
+ />
+ </Include>
+ </Files>
+ </CSHARP>
+</VisualStudioProject>
+
Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln Sat Mar 10 11:26:32 2007
@@ -0,0 +1,24 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Similarity.Net-2.0.0", "Similarity.Net-2.0.0.csproj", "{B9C12C92-199A-426E-B892-3531E2600217}"
+ ProjectSection(ProjectDependencies) = postProject
+ EndProjectSection
+EndProject
+Global
+ GlobalSection(DPCodeReviewSolutionGUID) = preSolution
+ DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000}
+ EndGlobalSection
+ GlobalSection(SolutionConfiguration) = preSolution
+ Debug = Debug
+ Release = Release
+ EndGlobalSection
+ GlobalSection(ProjectConfiguration) = postSolution
+ {B9C12C92-199A-426E-B892-3531E2600217}.Debug.ActiveCfg = Debug|.NET
+ {B9C12C92-199A-426E-B892-3531E2600217}.Debug.Build.0 = Debug|.NET
+ {B9C12C92-199A-426E-B892-3531E2600217}.Release.ActiveCfg = Release|.NET
+ {B9C12C92-199A-426E-B892-3531E2600217}.Release.Build.0 = Release|.NET
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ EndGlobalSection
+ GlobalSection(ExtensibilityAddIns) = postSolution
+ EndGlobalSection
+EndGlobal