You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/19 16:19:47 UTC
[14/21] More work on Lucene.Net.Queries
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ba0f3c7d/src/Lucene.Net.Queries/Mlt/MoreLikeThis.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/Mlt/MoreLikeThis.cs b/src/Lucene.Net.Queries/Mlt/MoreLikeThis.cs
index 1a25108..3e2dad5 100644
--- a/src/Lucene.Net.Queries/Mlt/MoreLikeThis.cs
+++ b/src/Lucene.Net.Queries/Mlt/MoreLikeThis.cs
@@ -1,981 +1,785 @@
-using System.Collections.Generic;
+// <summary>
+// Copyright 2004-2005 The Apache Software Foundation.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// </summary>
+
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
using System.Text;
-
-/// <summary>
-/// Copyright 2004-2005 The Apache Software Foundation.
-///
-/// Licensed under the Apache License, Version 2.0 (the "License");
-/// you may not use this file except in compliance with the License.
-/// You may obtain a copy of the License at
-///
-/// http://www.apache.org/licenses/LICENSE-2.0
-///
-/// Unless required by applicable law or agreed to in writing, software
-/// distributed under the License is distributed on an "AS IS" BASIS,
-/// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-/// See the License for the specific language governing permissions and
-/// limitations under the License.
-/// </summary>
-namespace org.apache.lucene.queries.mlt
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Search.Similarities;
+using Lucene.Net.Util;
+using Reader = System.IO.TextReader;
+
+namespace Lucene.Net.Queries.Mlt
{
-
-
- using Analyzer = org.apache.lucene.analysis.Analyzer;
- using TokenStream = org.apache.lucene.analysis.TokenStream;
- using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- using Document = org.apache.lucene.document.Document;
- using Fields = org.apache.lucene.index.Fields;
- using IndexReader = org.apache.lucene.index.IndexReader;
- using IndexableField = org.apache.lucene.index.IndexableField;
- using MultiFields = org.apache.lucene.index.MultiFields;
- using Term = org.apache.lucene.index.Term;
- using Terms = org.apache.lucene.index.Terms;
- using TermsEnum = org.apache.lucene.index.TermsEnum;
- using org.apache.lucene.search;
- using DefaultSimilarity = org.apache.lucene.search.similarities.DefaultSimilarity;
- using TFIDFSimilarity = org.apache.lucene.search.similarities.TFIDFSimilarity;
- using BytesRef = org.apache.lucene.util.BytesRef;
- using CharsRef = org.apache.lucene.util.CharsRef;
- using IOUtils = org.apache.lucene.util.IOUtils;
- using PriorityQueue = org.apache.lucene.util.PriorityQueue;
- using UnicodeUtil = org.apache.lucene.util.UnicodeUtil;
-
-
- /// <summary>
- /// Generate "more like this" similarity queries.
- /// Based on this mail:
- /// <code><pre>
- /// Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
- /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
- /// is usually fast enough. But looking up the docFreq() of every term in the document is
- /// probably too slow.
- /// <p/>
- /// You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
- /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
- /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
- /// reduce the number of terms under consideration. Another heuristic is that terms with a
- /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
- /// number of characters, not selecting anything less than, e.g., six or seven characters.
- /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
- /// that do a pretty good job of characterizing a document.
- /// <p/>
- /// It all depends on what you're trying to do. If you're trying to eek out that last percent
- /// of precision and recall regardless of computational difficulty so that you can win a TREC
- /// competition, then the techniques I mention above are useless. But if you're trying to
- /// provide a "more like this" button on a search results page that does a decent job and has
- /// good performance, such techniques might be useful.
- /// <p/>
- /// An efficient, effective "more-like-this" query generator would be a great contribution, if
- /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's
- /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
- /// above. The frequency and length thresholds could be parameters, etc.
- /// <p/>
- /// Doug
- /// </pre></code>
- /// <p/>
- /// <p/>
- /// <p/>
- /// <h3>Initial Usage</h3>
- /// <p/>
- /// This class has lots of options to try to make it efficient and flexible.
- /// The simplest possible usage is as follows. The bold
- /// fragment is specific to this class.
- /// <p/>
- /// <pre class="prettyprint">
- /// <p/>
- /// IndexReader ir = ...
- /// IndexSearcher is = ...
- /// <p/>
- /// MoreLikeThis mlt = new MoreLikeThis(ir);
- /// Reader target = ... // orig source of doc you want to find similarities to
- /// Query query = mlt.like( target);
- /// <p/>
- /// Hits hits = is.search(query);
- /// // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
- /// //you ignore the doc if it matches your 'target' document, as it should be similar to itself
- /// <p/>
- /// </pre>
- /// <p/>
- /// Thus you:
- /// <ol>
- /// <li> do your normal, Lucene setup for searching,
- /// <li> create a MoreLikeThis,
- /// <li> get the text of the doc you want to find similarities to
- /// <li> then call one of the like() calls to generate a similarity query
- /// <li> call the searcher to find the similar docs
- /// </ol>
- /// <p/>
- /// <h3>More Advanced Usage</h3>
- /// <p/>
- /// You may want to use <seealso cref="#setFieldNames setFieldNames(...)"/> so you can examine
- /// multiple fields (e.g. body and title) for similarity.
- /// <p/>
- /// <p/>
- /// Depending on the size of your index and the size and makeup of your documents you
- /// may want to call the other set methods to control how the similarity queries are
- /// generated:
- /// <ul>
- /// <li> <seealso cref="#setMinTermFreq setMinTermFreq(...)"/>
- /// <li> <seealso cref="#setMinDocFreq setMinDocFreq(...)"/>
- /// <li> <seealso cref="#setMaxDocFreq setMaxDocFreq(...)"/>
- /// <li> <seealso cref="#setMaxDocFreqPct setMaxDocFreqPct(...)"/>
- /// <li> <seealso cref="#setMinWordLen setMinWordLen(...)"/>
- /// <li> <seealso cref="#setMaxWordLen setMaxWordLen(...)"/>
- /// <li> <seealso cref="#setMaxQueryTerms setMaxQueryTerms(...)"/>
- /// <li> <seealso cref="#setMaxNumTokensParsed setMaxNumTokensParsed(...)"/>
- /// <li> <seealso cref="#setStopWords setStopWord(...)"/>
- /// </ul>
- /// <p/>
- /// <hr>
- /// <pre>
- /// Changes: Mark Harwood 29/02/04
- /// Some bugfixing, some refactoring, some optimisation.
- /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
- /// - bugfix: No significant terms being created for fields with a termvector - because
- /// was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
- /// - refactor: moved common code into isNoiseWord()
- /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
- /// </pre>
- /// </summary>
- public sealed class MoreLikeThis
- {
-
- /// <summary>
- /// Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
- /// </summary>
- /// <seealso cref= #getMaxNumTokensParsed </seealso>
- public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
-
- /// <summary>
- /// Ignore terms with less than this frequency in the source doc.
- /// </summary>
- /// <seealso cref= #getMinTermFreq </seealso>
- /// <seealso cref= #setMinTermFreq </seealso>
- public const int DEFAULT_MIN_TERM_FREQ = 2;
-
- /// <summary>
- /// Ignore words which do not occur in at least this many docs.
- /// </summary>
- /// <seealso cref= #getMinDocFreq </seealso>
- /// <seealso cref= #setMinDocFreq </seealso>
- public const int DEFAULT_MIN_DOC_FREQ = 5;
-
- /// <summary>
- /// Ignore words which occur in more than this many docs.
- /// </summary>
- /// <seealso cref= #getMaxDocFreq </seealso>
- /// <seealso cref= #setMaxDocFreq </seealso>
- /// <seealso cref= #setMaxDocFreqPct </seealso>
- public static readonly int DEFAULT_MAX_DOC_FREQ = int.MaxValue;
-
- /// <summary>
- /// Boost terms in query based on score.
- /// </summary>
- /// <seealso cref= #isBoost </seealso>
- /// <seealso cref= #setBoost </seealso>
- public const bool DEFAULT_BOOST = false;
-
- /// <summary>
- /// Default field names. Null is used to specify that the field names should be looked
- /// up at runtime from the provided reader.
- /// </summary>
- public static readonly string[] DEFAULT_FIELD_NAMES = new string[]{"contents"};
-
- /// <summary>
- /// Ignore words less than this length or if 0 then this has no effect.
- /// </summary>
- /// <seealso cref= #getMinWordLen </seealso>
- /// <seealso cref= #setMinWordLen </seealso>
- public const int DEFAULT_MIN_WORD_LENGTH = 0;
-
- /// <summary>
- /// Ignore words greater than this length or if 0 then this has no effect.
- /// </summary>
- /// <seealso cref= #getMaxWordLen </seealso>
- /// <seealso cref= #setMaxWordLen </seealso>
- public const int DEFAULT_MAX_WORD_LENGTH = 0;
-
- /// <summary>
- /// Default set of stopwords.
- /// If null means to allow stop words.
- /// </summary>
- /// <seealso cref= #setStopWords </seealso>
- /// <seealso cref= #getStopWords </seealso>
-//JAVA TO C# CONVERTER TODO TASK: Java wildcard generics are not converted to .NET:
-//ORIGINAL LINE: public static final Set<?> DEFAULT_STOP_WORDS = null;
- public const HashSet<?> DEFAULT_STOP_WORDS = null;
-
- /// <summary>
- /// Current set of stop words.
- /// </summary>
-//JAVA TO C# CONVERTER TODO TASK: Java wildcard generics are not converted to .NET:
-//ORIGINAL LINE: private Set<?> stopWords = DEFAULT_STOP_WORDS;
- private HashSet<?> stopWords = DEFAULT_STOP_WORDS;
-
- /// <summary>
- /// Return a Query with no more than this many terms.
- /// </summary>
- /// <seealso cref= BooleanQuery#getMaxClauseCount </seealso>
- /// <seealso cref= #getMaxQueryTerms </seealso>
- /// <seealso cref= #setMaxQueryTerms </seealso>
- public const int DEFAULT_MAX_QUERY_TERMS = 25;
-
- /// <summary>
- /// Analyzer that will be used to parse the doc.
- /// </summary>
- private Analyzer analyzer = null;
-
- /// <summary>
- /// Ignore words less frequent that this.
- /// </summary>
- private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
-
- /// <summary>
- /// Ignore words which do not occur in at least this many docs.
- /// </summary>
- private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
-
- /// <summary>
- /// Ignore words which occur in more than this many docs.
- /// </summary>
- private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
-
- /// <summary>
- /// Should we apply a boost to the Query based on the scores?
- /// </summary>
- private bool boost = DEFAULT_BOOST;
-
- /// <summary>
- /// Field name we'll analyze.
- /// </summary>
- private string[] fieldNames = DEFAULT_FIELD_NAMES;
-
- /// <summary>
- /// The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
- /// </summary>
- private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
-
- /// <summary>
- /// Ignore words if less than this len.
- /// </summary>
- private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
-
- /// <summary>
- /// Ignore words if greater than this len.
- /// </summary>
- private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
-
- /// <summary>
- /// Don't return a query longer than this.
- /// </summary>
- private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
-
- /// <summary>
- /// For idf() calculations.
- /// </summary>
- private TFIDFSimilarity similarity; // = new DefaultSimilarity();
-
- /// <summary>
- /// IndexReader to use
- /// </summary>
- private readonly IndexReader ir;
-
- /// <summary>
- /// Boost factor to use when boosting the terms
- /// </summary>
- private float boostFactor = 1;
-
- /// <summary>
- /// Returns the boost factor used when boosting terms
- /// </summary>
- /// <returns> the boost factor used when boosting terms </returns>
- /// <seealso cref= #setBoostFactor(float) </seealso>
- public float BoostFactor
- {
- get
- {
- return boostFactor;
- }
- set
- {
- this.boostFactor = value;
- }
- }
-
-
- /// <summary>
- /// Constructor requiring an IndexReader.
- /// </summary>
- public MoreLikeThis(IndexReader ir) : this(ir, new DefaultSimilarity())
- {
- }
-
- public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim)
- {
- this.ir = ir;
- this.similarity = sim;
- }
-
-
- public TFIDFSimilarity Similarity
- {
- get
- {
- return similarity;
- }
- set
- {
- this.similarity = value;
- }
- }
-
-
- /// <summary>
- /// Returns an analyzer that will be used to parse source doc with. The default analyzer
- /// is not set.
- /// </summary>
- /// <returns> the analyzer that will be used to parse source doc with. </returns>
- public Analyzer Analyzer
- {
- get
- {
- return analyzer;
- }
- set
- {
- this.analyzer = value;
- }
- }
-
-
- /// <summary>
- /// Returns the frequency below which terms will be ignored in the source doc. The default
- /// frequency is the <seealso cref="#DEFAULT_MIN_TERM_FREQ"/>.
- /// </summary>
- /// <returns> the frequency below which terms will be ignored in the source doc. </returns>
- public int MinTermFreq
- {
- get
- {
- return minTermFreq;
- }
- set
- {
- this.minTermFreq = value;
- }
- }
-
-
- /// <summary>
- /// Returns the frequency at which words will be ignored which do not occur in at least this
- /// many docs. The default frequency is <seealso cref="#DEFAULT_MIN_DOC_FREQ"/>.
- /// </summary>
- /// <returns> the frequency at which words will be ignored which do not occur in at least this
- /// many docs. </returns>
- public int MinDocFreq
- {
- get
- {
- return minDocFreq;
- }
- set
- {
- this.minDocFreq = value;
- }
- }
-
-
- /// <summary>
- /// Returns the maximum frequency in which words may still appear.
- /// Words that appear in more than this many docs will be ignored. The default frequency is
- /// <seealso cref="#DEFAULT_MAX_DOC_FREQ"/>.
- /// </summary>
- /// <returns> get the maximum frequency at which words are still allowed,
- /// words which occur in more docs than this are ignored. </returns>
- public int MaxDocFreq
- {
- get
- {
- return maxDocFreq;
- }
- set
- {
- this.maxDocFreq = value;
- }
- }
-
-
- /// <summary>
- /// Set the maximum percentage in which words may still appear. Words that appear
- /// in more than this many percent of all docs will be ignored.
- /// </summary>
- /// <param name="maxPercentage"> the maximum percentage of documents (0-100) that a term may appear
- /// in to be still considered relevant </param>
- public int MaxDocFreqPct
- {
- set
- {
- this.maxDocFreq = value * ir.numDocs() / 100;
- }
- }
-
-
- /// <summary>
- /// Returns whether to boost terms in query based on "score" or not. The default is
- /// <seealso cref="#DEFAULT_BOOST"/>.
- /// </summary>
- /// <returns> whether to boost terms in query based on "score" or not. </returns>
- /// <seealso cref= #setBoost </seealso>
- public bool Boost
- {
- get
- {
- return boost;
- }
- set
- {
- this.boost = value;
- }
- }
-
-
- /// <summary>
- /// Returns the field names that will be used when generating the 'More Like This' query.
- /// The default field names that will be used is <seealso cref="#DEFAULT_FIELD_NAMES"/>.
- /// </summary>
- /// <returns> the field names that will be used when generating the 'More Like This' query. </returns>
- public string[] FieldNames
- {
- get
- {
- return fieldNames;
- }
- set
- {
- this.fieldNames = value;
- }
- }
-
-
- /// <summary>
- /// Returns the minimum word length below which words will be ignored. Set this to 0 for no
- /// minimum word length. The default is <seealso cref="#DEFAULT_MIN_WORD_LENGTH"/>.
- /// </summary>
- /// <returns> the minimum word length below which words will be ignored. </returns>
- public int MinWordLen
- {
- get
- {
- return minWordLen;
- }
- set
- {
- this.minWordLen = value;
- }
- }
-
-
- /// <summary>
- /// Returns the maximum word length above which words will be ignored. Set this to 0 for no
- /// maximum word length. The default is <seealso cref="#DEFAULT_MAX_WORD_LENGTH"/>.
- /// </summary>
- /// <returns> the maximum word length above which words will be ignored. </returns>
- public int MaxWordLen
- {
- get
- {
- return maxWordLen;
- }
- set
- {
- this.maxWordLen = value;
- }
- }
-
-
- /// <summary>
- /// Set the set of stopwords.
- /// Any word in this set is considered "uninteresting" and ignored.
- /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
- /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
- /// </summary>
- /// <param name="stopWords"> set of stopwords, if null it means to allow stop words </param>
- /// <seealso cref= #getStopWords </seealso>
- public HashSet<T1> StopWords<T1>
- {
- set
- {
- this.stopWords = value;
- }
- get
- {
- return stopWords;
- }
- }
-
-
-
- /// <summary>
- /// Returns the maximum number of query terms that will be included in any generated query.
- /// The default is <seealso cref="#DEFAULT_MAX_QUERY_TERMS"/>.
- /// </summary>
- /// <returns> the maximum number of query terms that will be included in any generated query. </returns>
- public int MaxQueryTerms
- {
- get
- {
- return maxQueryTerms;
- }
- set
- {
- this.maxQueryTerms = value;
- }
- }
-
-
- /// <returns> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support </returns>
- /// <seealso cref= #DEFAULT_MAX_NUM_TOKENS_PARSED </seealso>
- public int MaxNumTokensParsed
- {
- get
- {
- return maxNumTokensParsed;
- }
- set
- {
- maxNumTokensParsed = value;
- }
- }
-
-
-
- /// <summary>
- /// Return a query that will return docs like the passed lucene document ID.
- /// </summary>
- /// <param name="docNum"> the documentID of the lucene doc to generate the 'More Like This" query for. </param>
- /// <returns> a query that will return docs like the passed lucene document ID. </returns>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public Query like(int docNum) throws IOException
- public Query like(int docNum)
- {
- if (fieldNames == null)
- {
- // gather list of valid fields from lucene
- ICollection<string> fields = MultiFields.getIndexedFields(ir);
- fieldNames = fields.toArray(new string[fields.Count]);
- }
-
- return createQuery(retrieveTerms(docNum));
- }
-
- /// <summary>
- /// Return a query that will return docs like the passed Reader.
- /// </summary>
- /// <returns> a query that will return docs like the passed Reader. </returns>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public Query like(Reader r, String fieldName) throws IOException
- public Query like(Reader r, string fieldName)
- {
- return createQuery(retrieveTerms(r, fieldName));
- }
-
- /// <summary>
- /// Create the More like query from a PriorityQueue
- /// </summary>
- private Query createQuery(PriorityQueue<object[]> q)
- {
- BooleanQuery query = new BooleanQuery();
- object cur;
- int qterms = 0;
- float bestScore = 0;
-
- while ((cur = q.pop()) != null)
- {
- object[] ar = (object[]) cur;
- TermQuery tq = new TermQuery(new Term((string) ar[1], (string) ar[0]));
-
- if (boost)
- {
- if (qterms == 0)
- {
- bestScore = ((float?) ar[2]);
- }
- float myScore = ((float?) ar[2]);
-
- tq.Boost = boostFactor * myScore / bestScore;
- }
-
- try
- {
- query.add(tq, BooleanClause.Occur.SHOULD);
- }
- catch (BooleanQuery.TooManyClauses)
- {
- break;
- }
-
- qterms++;
- if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
- {
- break;
- }
- }
-
- return query;
- }
-
- /// <summary>
- /// Create a PriorityQueue from a word->tf map.
- /// </summary>
- /// <param name="words"> a map of words keyed on the word(String) with Int objects as the values. </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private org.apache.lucene.util.PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException
- private PriorityQueue<object[]> createQueue(IDictionary<string, Int> words)
- {
- // have collected all words in doc and their freqs
- int numDocs = ir.numDocs();
- FreqQ res = new FreqQ(words.Count); // will order words by score
-
- foreach (string word in words.Keys) // for every word
- {
- int tf = words[word].x; // term freq in the source doc
- if (minTermFreq > 0 && tf < minTermFreq)
- {
- continue; // filter out words that don't occur enough times in the source
- }
-
- // go through all the fields and find the largest document frequency
- string topField = fieldNames[0];
- int docFreq = 0;
- foreach (string fieldName in fieldNames)
- {
- int freq = ir.docFreq(new Term(fieldName, word));
- topField = (freq > docFreq) ? fieldName : topField;
- docFreq = (freq > docFreq) ? freq : docFreq;
- }
-
- if (minDocFreq > 0 && docFreq < minDocFreq)
- {
- continue; // filter out words that don't occur in enough docs
- }
-
- if (docFreq > maxDocFreq)
- {
- continue; // filter out words that occur in too many docs
- }
-
- if (docFreq == 0)
- {
- continue; // index update problem?
- }
-
- float idf = similarity.idf(docFreq, numDocs);
- float score = tf * idf;
-
- // only really need 1st 3 entries, other ones are for troubleshooting
- res.insertWithOverflow(new object[]{word, topField, score, idf, docFreq, tf}); // freq in all docs - idf - overall score - the top field - the word
- }
- return res;
- }
-
- /// <summary>
- /// Describe the parameters that control how the "more like this" query is formed.
- /// </summary>
- public string describeParams()
- {
- StringBuilder sb = new StringBuilder();
- sb.Append("\t").Append("maxQueryTerms : ").Append(maxQueryTerms).Append("\n");
- sb.Append("\t").Append("minWordLen : ").Append(minWordLen).Append("\n");
- sb.Append("\t").Append("maxWordLen : ").Append(maxWordLen).Append("\n");
- sb.Append("\t").Append("fieldNames : ");
- string delim = "";
- foreach (string fieldName in fieldNames)
- {
- sb.Append(delim).Append(fieldName);
- delim = ", ";
- }
- sb.Append("\n");
- sb.Append("\t").Append("boost : ").Append(boost).Append("\n");
- sb.Append("\t").Append("minTermFreq : ").Append(minTermFreq).Append("\n");
- sb.Append("\t").Append("minDocFreq : ").Append(minDocFreq).Append("\n");
- return sb.ToString();
- }
-
- /// <summary>
- /// Find words for a more-like-this query former.
- /// </summary>
- /// <param name="docNum"> the id of the lucene document from which to find terms </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public org.apache.lucene.util.PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException
- public PriorityQueue<object[]> retrieveTerms(int docNum)
- {
- IDictionary<string, Int> termFreqMap = new Dictionary<string, Int>();
- foreach (string fieldName in fieldNames)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.index.Fields vectors = ir.getTermVectors(docNum);
- Fields vectors = ir.getTermVectors(docNum);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.index.Terms vector;
- Terms vector;
- if (vectors != null)
- {
- vector = vectors.terms(fieldName);
- }
- else
- {
- vector = null;
- }
-
- // field does not store term vector info
- if (vector == null)
- {
- Document d = ir.document(docNum);
- IndexableField[] fields = d.getFields(fieldName);
- foreach (IndexableField field in fields)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String stringValue = field.stringValue();
- string stringValue = field.stringValue();
- if (stringValue != null)
- {
- addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
- }
- }
- }
- else
- {
- addTermFrequencies(termFreqMap, vector);
- }
- }
-
- return createQueue(termFreqMap);
- }
-
- /// <summary>
- /// Adds terms and frequencies found in vector into the Map termFreqMap
- /// </summary>
- /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
- /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void addTermFrequencies(Map<String, Int> termFreqMap, org.apache.lucene.index.Terms vector) throws IOException
- private void addTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.index.TermsEnum termsEnum = vector.iterator(null);
- TermsEnum termsEnum = vector.iterator(null);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.CharsRef spare = new org.apache.lucene.util.CharsRef();
- CharsRef spare = new CharsRef();
- BytesRef text;
- while ((text = termsEnum.next()) != null)
- {
- UnicodeUtil.UTF8toUTF16(text, spare);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String term = spare.toString();
- string term = spare.ToString();
- if (isNoiseWord(term))
- {
- continue;
- }
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int freq = (int) termsEnum.totalTermFreq();
- int freq = (int) termsEnum.totalTermFreq();
-
- // increment frequency
- Int cnt = termFreqMap[term];
- if (cnt == null)
- {
- cnt = new Int();
- termFreqMap[term] = cnt;
- cnt.x = freq;
- }
- else
- {
- cnt.x += freq;
- }
- }
- }
-
- /// <summary>
- /// Adds term frequencies found by tokenizing text from reader into the Map words
- /// </summary>
- /// <param name="r"> a source of text to be tokenized </param>
- /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
- /// <param name="fieldName"> Used by analyzer for any special per-field analysis </param>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException
- private void addTermFrequencies(Reader r, IDictionary<string, Int> termFreqMap, string fieldName)
- {
- if (analyzer == null)
- {
- throw new System.NotSupportedException("To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
- }
- TokenStream ts = analyzer.tokenStream(fieldName, r);
- try
- {
- int tokenCount = 0;
- // for every token
- CharTermAttribute termAtt = ts.addAttribute(typeof(CharTermAttribute));
- ts.reset();
- while (ts.incrementToken())
- {
- string word = termAtt.ToString();
- tokenCount++;
- if (tokenCount > maxNumTokensParsed)
- {
- break;
- }
- if (isNoiseWord(word))
- {
- continue;
- }
-
- // increment frequency
- Int cnt = termFreqMap[word];
- if (cnt == null)
- {
- termFreqMap[word] = new Int();
- }
- else
- {
- cnt.x++;
- }
- }
- ts.end();
- }
- finally
- {
- IOUtils.closeWhileHandlingException(ts);
- }
- }
-
-
- /// <summary>
- /// determines if the passed term is likely to be of interest in "more like" comparisons
- /// </summary>
- /// <param name="term"> The word being considered </param>
- /// <returns> true if should be ignored, false if should be used in further analysis </returns>
- private bool isNoiseWord(string term)
- {
- int len = term.Length;
- if (minWordLen > 0 && len < minWordLen)
- {
- return true;
- }
- if (maxWordLen > 0 && len > maxWordLen)
- {
- return true;
- }
- return stopWords != null && stopWords.Contains(term);
- }
-
-
- /// <summary>
- /// Find words for a more-like-this query former.
- /// The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
- /// Each array has 6 elements.
- /// The elements are:
- /// <ol>
- /// <li> The word (String)
- /// <li> The top field that this word comes from (String)
- /// <li> The score for this word (Float)
- /// <li> The IDF value (Float)
- /// <li> The frequency of this word in the index (Integer)
- /// <li> The frequency of this word in the source document (Integer)
- /// </ol>
- /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
- /// This method is exposed so that you can identify the "interesting words" in a document.
- /// For an easier method to call see <seealso cref="#retrieveInterestingTerms retrieveInterestingTerms()"/>.
- /// </summary>
- /// <param name="r"> the reader that has the content of the document </param>
- /// <param name="fieldName"> field passed to the analyzer to use when analyzing the content </param>
- /// <returns> the most interesting words in the document ordered by score, with the highest scoring, or best entry, first </returns>
- /// <seealso cref= #retrieveInterestingTerms </seealso>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public org.apache.lucene.util.PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException
- public PriorityQueue<object[]> retrieveTerms(Reader r, string fieldName)
- {
- IDictionary<string, Int> words = new Dictionary<string, Int>();
- addTermFrequencies(r, words, fieldName);
- return createQueue(words);
- }
-
- /// <seealso cref= #retrieveInterestingTerms(java.io.Reader, String) </seealso>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public String[] retrieveInterestingTerms(int docNum) throws IOException
- public string[] retrieveInterestingTerms(int docNum)
- {
- List<object> al = new List<object>(maxQueryTerms);
- PriorityQueue<object[]> pq = retrieveTerms(docNum);
- object cur;
- int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
- // we just want to return the top words
- while (((cur = pq.pop()) != null) && lim-- > 0)
- {
- object[] ar = (object[]) cur;
- al.Add(ar[0]); // the 1st entry is the interesting word
- }
- string[] res = new string[al.Count];
- return al.toArray(res);
- }
-
- /// <summary>
- /// Convenience routine to make it easy to return the most interesting words in a document.
- /// More advanced users will call <seealso cref="#retrieveTerms(Reader, String) retrieveTerms()"/> directly.
- /// </summary>
- /// <param name="r"> the source document </param>
- /// <param name="fieldName"> field passed to analyzer to use when analyzing the content </param>
- /// <returns> the most interesting words in the document </returns>
- /// <seealso cref= #retrieveTerms(java.io.Reader, String) </seealso>
- /// <seealso cref= #setMaxQueryTerms </seealso>
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException
- public string[] retrieveInterestingTerms(Reader r, string fieldName)
- {
- List<object> al = new List<object>(maxQueryTerms);
- PriorityQueue<object[]> pq = retrieveTerms(r, fieldName);
- object cur;
- int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
- // we just want to return the top words
- while (((cur = pq.pop()) != null) && lim-- > 0)
- {
- object[] ar = (object[]) cur;
- al.Add(ar[0]); // the 1st entry is the interesting word
- }
- string[] res = new string[al.Count];
- return al.toArray(res);
- }
-
- /// <summary>
- /// PriorityQueue that orders words by score.
- /// </summary>
- private class FreqQ : PriorityQueue<object[]>
- {
- internal FreqQ(int s) : base(s)
- {
- }
-
- protected internal override bool lessThan(object[] aa, object[] bb)
- {
- float? fa = (float?) aa[2];
- float? fb = (float?) bb[2];
- return fa > fb;
- }
- }
-
- /// <summary>
- /// Use for frequencies and to avoid renewing Integers.
- /// </summary>
- private class Int
- {
- internal int x;
-
- internal Int()
- {
- x = 1;
- }
- }
- }
+ /// <summary>
+ /// Generate "more like this" similarity queries.
+ /// Based on this mail:
+ /// <code><pre>
+ /// Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
+ /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+ /// is usually fast enough. But looking up the docFreq() of every term in the document is
+ /// probably too slow.
+ /// <p/>
+ /// You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
+ /// or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
+ /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+ /// reduce the number of terms under consideration. Another heuristic is that terms with a
+ /// high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
+ /// number of characters, not selecting anything less than, e.g., six or seven characters.
+ /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+ /// that do a pretty good job of characterizing a document.
+ /// <p/>
+ /// It all depends on what you're trying to do. If you're trying to eek out that last percent
+ /// of precision and recall regardless of computational difficulty so that you can win a TREC
+ /// competition, then the techniques I mention above are useless. But if you're trying to
+ /// provide a "more like this" button on a search results page that does a decent job and has
+ /// good performance, such techniques might be useful.
+ /// <p/>
+ /// An efficient, effective "more-like-this" query generator would be a great contribution, if
+ /// anyone's interested. I'd imagine that it would take a Reader or a String (the document's
+ /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+ /// above. The frequency and length thresholds could be parameters, etc.
+ /// <p/>
+ /// Doug
+ /// </pre></code>
+ /// <p/>
+ /// <p/>
+ /// <p/>
+ /// <h3>Initial Usage</h3>
+ /// <p/>
+ /// This class has lots of options to try to make it efficient and flexible.
+ /// The simplest possible usage is as follows. The bold
+ /// fragment is specific to this class.
+ /// <p/>
+ /// <pre class="prettyprint">
+ /// <p/>
+ /// IndexReader ir = ...
+ /// IndexSearcher is = ...
+ /// <p/>
+ /// MoreLikeThis mlt = new MoreLikeThis(ir);
+ /// Reader target = ... // orig source of doc you want to find similarities to
+ /// Query query = mlt.Like(target, "contents"); // second argument: field name handed to the analyzer
+ /// <p/>
+ /// Hits hits = is.search(query);
+ /// // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+ /// //you ignore the doc if it matches your 'target' document, as it should be similar to itself
+ /// <p/>
+ /// </pre>
+ /// <p/>
+ /// Thus you:
+ /// <ol>
+ /// <li> do your normal, Lucene setup for searching,
+ /// <li> create a MoreLikeThis,
+ /// <li> get the text of the doc you want to find similarities to
+ /// <li> then call one of the like() calls to generate a similarity query
+ /// <li> call the searcher to find the similar docs
+ /// </ol>
+ /// <p/>
+ /// <h3>More Advanced Usage</h3>
+ /// <p/>
+ /// You may want to set <see cref="FieldNames"/> so you can examine
+ /// multiple fields (e.g. body and title) for similarity.
+ /// <p/>
+ /// <p/>
+ /// Depending on the size of your index and the size and makeup of your documents you
+ /// may want to set the other properties to control how the similarity queries are
+ /// generated:
+ /// <ul>
+ /// <li><see cref="MinTermFreq"/></li>
+ /// <li><see cref="MinDocFreq"/></li>
+ /// <li><see cref="MaxDocFreq"/></li>
+ /// <li><see cref="MaxDocFreqPct"/></li>
+ /// <li><see cref="MinWordLen"/></li>
+ /// <li><see cref="MaxWordLen"/></li>
+ /// <li><see cref="MaxQueryTerms"/></li>
+ /// <li><see cref="MaxNumTokensParsed"/></li>
+ /// <li><see cref="StopWords"/></li>
+ /// </ul>
+ /// <p/>
+ /// <hr>
+ /// <pre>
+ /// Changes: Mark Harwood 29/02/04
+ /// Some bugfixing, some refactoring, some optimisation.
+ /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+ /// - bugfix: No significant terms being created for fields with a termvector - because
+ /// was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
+ /// - refactor: moved common code into isNoiseWord()
+ /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+ /// </pre>
+ /// </summary>
+ public sealed class MoreLikeThis
+ {
+
+ /// <summary>
+ /// Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
+ /// </summary>
+ /// <seealso cref="MaxNumTokensParsed"/>
+ public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+ /// <summary>
+ /// Ignore terms with less than this frequency in the source doc.
+ /// </summary>
+ /// <seealso cref="MinTermFreq"/>
+ public const int DEFAULT_MIN_TERM_FREQ = 2;
+
+ /// <summary>
+ /// Ignore words which do not occur in at least this many docs.
+ /// </summary>
+ /// <seealso cref="MinDocFreq"/>
+ public const int DEFAULT_MIN_DOC_FREQ = 5;
+
+ /// <summary>
+ /// Ignore words which occur in more than this many docs.
+ /// </summary>
+ /// <seealso cref="MaxDocFreq"/>
+ /// <seealso cref="MaxDocFreqPct"/>
+ public static readonly int DEFAULT_MAX_DOC_FREQ = int.MaxValue;
+
+ /// <summary>
+ /// Boost terms in query based on score.
+ /// </summary>
+ /// <seealso cref="Boost"/>
+ public const bool DEFAULT_BOOST = false;
+
+ /// <summary>
+ /// Default field names. Setting <see cref="FieldNames"/> to null instead makes
+ /// <c>Like(int)</c> look the indexed field names up at runtime from the provided reader.
+ /// </summary>
+ /// <seealso cref="FieldNames"/>
+ public static readonly string[] DEFAULT_FIELD_NAMES = new string[] { "contents" };
+
+ /// <summary>
+ /// Ignore words less than this length or if 0 then this has no effect.
+ /// </summary>
+ /// <seealso cref="MinWordLen"/>
+ public const int DEFAULT_MIN_WORD_LENGTH = 0;
+
+ /// <summary>
+ /// Ignore words greater than this length or if 0 then this has no effect.
+ /// </summary>
+ /// <seealso cref="MaxWordLen"/>
+ public const int DEFAULT_MAX_WORD_LENGTH = 0;
+
+ /// <summary>
+ /// Default set of stopwords.
+ /// If null means to allow stop words.
+ /// </summary>
+ /// <seealso cref="StopWords"/>
+ public const ISet<string> DEFAULT_STOP_WORDS = null;
+
+ /// <summary>
+ /// Return a Query with no more than this many terms.
+ /// </summary>
+ /// <remarks>
+ /// Related to the BooleanQuery maximum clause count (<c>BooleanQuery#getMaxClauseCount</c> in the Java original).
+ /// </remarks>
+ /// <seealso cref="MaxQueryTerms"/>
+ public const int DEFAULT_MAX_QUERY_TERMS = 25;
+
+ /// <summary>
+ /// Index reader that supplies document counts, document frequencies, term vectors and stored documents.
+ /// </summary>
+ private readonly IndexReader ir;
+
+ /// <summary>
+ /// Backing field for <see cref="BoostFactor"/>; starts at 1.
+ /// </summary>
+ private float boostFactor = 1f;
+
+ /// <summary>
+ /// Gets or sets the factor that term boosts are multiplied by when boosting is enabled.
+ /// </summary>
+ public float BoostFactor
+ {
+     get { return boostFactor; }
+     set { boostFactor = value; }
+ }
+
+
+ /// <summary>
+ /// Creates an instance over <paramref name="ir"/> that scores terms with a new <c>DefaultSimilarity</c>.
+ /// </summary>
+ /// <param name="ir"> reader over the index to draw terms and statistics from </param>
+ public MoreLikeThis(IndexReader ir)
+     : this(ir, new DefaultSimilarity())
+ {
+ }
+
+ /// <summary>
+ /// Creates an instance over <paramref name="ir"/> with an explicit similarity and every
+ /// tunable property initialised from its <c>DEFAULT_*</c> value.
+ /// </summary>
+ /// <param name="ir"> reader over the index to draw terms and statistics from </param>
+ /// <param name="sim"> similarity stored in <see cref="Similarity"/> </param>
+ public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim)
+ {
+     this.ir = ir;
+     Similarity = sim;
+
+     // frequency thresholds
+     MinTermFreq = DEFAULT_MIN_TERM_FREQ;
+     MinDocFreq = DEFAULT_MIN_DOC_FREQ;
+     MaxDocFreq = DEFAULT_MAX_DOC_FREQ;
+
+     // word filters
+     MinWordLen = DEFAULT_MIN_WORD_LENGTH;
+     MaxWordLen = DEFAULT_MAX_WORD_LENGTH;
+     StopWords = DEFAULT_STOP_WORDS;
+
+     // query shape
+     FieldNames = DEFAULT_FIELD_NAMES;
+     Boost = DEFAULT_BOOST;
+     MaxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+     MaxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+ }
+
+
+ /// <summary>
+ /// Gets or sets the similarity whose <c>Idf</c> value is multiplied with the term frequency
+ /// to score candidate terms.
+ /// </summary>
+ public TFIDFSimilarity Similarity { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the analyzer used to tokenize source text that has no term vector.
+ /// No analyzer is set by default; tokenizing without one throws <c>NotSupportedException</c>.
+ /// </summary>
+ public Analyzer Analyzer { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the frequency below which terms will be ignored in the source doc. The default
+ /// frequency is <see cref="DEFAULT_MIN_TERM_FREQ"/>. Values of 0 or less disable the check.
+ /// </summary>
+ public int MinTermFreq { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the minimum number of docs a word must occur in; words occurring in fewer
+ /// docs are ignored. The default is <see cref="DEFAULT_MIN_DOC_FREQ"/>. Values of 0 or less
+ /// disable the check.
+ /// </summary>
+ public int MinDocFreq { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the maximum frequency in which words may still appear.
+ /// Words that appear in more than this many docs will be ignored. The default frequency is
+ /// <see cref="DEFAULT_MAX_DOC_FREQ"/>.
+ /// </summary>
+ /// <seealso cref="MaxDocFreqPct"/>
+ public int MaxDocFreq { get; set; }
+
+
+ /// <summary>
+ /// Sets the maximum percentage in which words may still appear. Words that appear
+ /// in more than this many percent of all docs will be ignored.
+ /// The value is the maximum percentage of documents (0-100) that a term may appear in
+ /// to be still considered relevant; it is converted to an absolute document count
+ /// using the reader's current <c>NumDocs</c> and stored in <see cref="MaxDocFreq"/>.
+ /// </summary>
+ public int MaxDocFreqPct
+ {
+     set
+     {
+         // Multiply in 64-bit: value * NumDocs exceeds Int32 once the index holds
+         // more than roughly 21 million documents at value == 100.
+         MaxDocFreq = (int)((long)value * ir.NumDocs / 100);
+     }
+ }
+
+
+ /// <summary>
+ /// Gets or sets whether to boost terms in query based on "score" or not. The default is
+ /// <see cref="DEFAULT_BOOST"/>.
+ /// </summary>
+ /// <seealso cref="BoostFactor"/>
+ public bool Boost { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the field names that will be used when generating the 'More Like This' query.
+ /// The default field names that will be used is <see cref="DEFAULT_FIELD_NAMES"/>.
+ /// </summary>
+ public string[] FieldNames { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the minimum word length below which words will be ignored. Set this to 0 for no
+ /// minimum word length. The default is <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
+ /// </summary>
+ public int MinWordLen { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the maximum word length above which words will be ignored. Set this to 0 for no
+ /// maximum word length. The default is <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
+ /// </summary>
+ public int MaxWordLen { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the set of stopwords; null means stop words are allowed.
+ /// Any word in this set is considered "uninteresting" and ignored.
+ /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+ /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+ /// </summary>
+ /// <seealso cref="DEFAULT_STOP_WORDS"/>
+ public ISet<string> StopWords { get; set; }
+
+ /// <summary>
+ /// Gets or sets the maximum number of query terms that will be included in any generated query.
+ /// The default is <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
+ /// </summary>
+ public int MaxQueryTerms { get; set; }
+
+
+ /// <summary>
+ /// Gets or sets the maximum number of tokens to parse in each example doc field that is not
+ /// stored with TermVector support.
+ /// </summary>
+ /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED"/>
+ public int MaxNumTokensParsed { get; set; }
+
+
+
+ /// <summary>
+ /// Builds a query that matches documents similar to the indexed document <paramref name="docNum"/>.
+ /// When <see cref="FieldNames"/> is null it is first filled with the indexed fields reported by the reader.
+ /// </summary>
+ /// <param name="docNum"> the documentID of the lucene doc to generate the 'More Like This" query for. </param>
+ /// <returns> a query that will return docs like the passed lucene document ID. </returns>
+ public Query Like(int docNum)
+ {
+     if (FieldNames == null)
+     {
+         // no explicit field list: fall back to every indexed field known to the reader
+         ICollection<string> indexedFields = MultiFields.GetIndexedFields(ir);
+         FieldNames = indexedFields.ToArray();
+     }
+
+     PriorityQueue<object[]> interestingTerms = RetrieveTerms(docNum);
+     return CreateQuery(interestingTerms);
+ }
+
+ /// <summary>
+ /// Builds a query that matches documents similar to the text supplied by <paramref name="r"/>.
+ /// </summary>
+ /// <param name="r"> source of the text to find similar documents for </param>
+ /// <param name="fieldName"> field name passed on to term retrieval (and from there to the analyzer) </param>
+ /// <returns> a query that will return docs like the passed Reader. </returns>
+ public Query Like(Reader r, string fieldName)
+ {
+     PriorityQueue<object[]> interestingTerms = RetrieveTerms(r, fieldName);
+     return CreateQuery(interestingTerms);
+ }
+
+ /// <summary>
+ /// Create the More like query from a PriorityQueue.
+ /// Entries are popped until the queue is empty, <see cref="MaxQueryTerms"/> is reached
+ /// (when it is greater than 0), or the BooleanQuery refuses further clauses.
+ /// </summary>
+ /// <param name="q"> queue of term entries: [0] = word, [1] = field, [2] = score (boxed float) </param>
+ /// <returns> a BooleanQuery with one SHOULD clause per selected term </returns>
+ private Query CreateQuery(PriorityQueue<object[]> q)
+ {
+     BooleanQuery query = new BooleanQuery();
+     int qterms = 0;
+     float bestScore = 0;
+     object[] ar;
+
+     while ((ar = q.Pop()) != null)
+     {
+         TermQuery tq = new TermQuery(new Term((string)ar[1], (string)ar[0]));
+
+         if (Boost)
+         {
+             // The score is stored as a boxed float; unbox it directly. (A float? cannot be
+             // assigned to a float without an explicit conversion.)
+             float myScore = (float)ar[2];
+             if (qterms == 0)
+             {
+                 // the first entry popped is the reference every later boost is scaled against
+                 bestScore = myScore;
+             }
+
+             tq.Boost = boostFactor * myScore / bestScore;
+         }
+
+         try
+         {
+             query.Add(tq, BooleanClause.Occur.SHOULD);
+         }
+         catch (BooleanQuery.TooManyClauses)
+         {
+             // the query is full: keep what was added so far
+             break;
+         }
+
+         qterms++;
+         if (MaxQueryTerms > 0 && qterms >= MaxQueryTerms)
+         {
+             break;
+         }
+     }
+
+     return query;
+ }
+
+ /// <summary>
+ /// Create a PriorityQueue from a word->tf map.
+ /// Words are dropped when their source frequency is below <see cref="MinTermFreq"/>, or when their
+ /// largest per-field document frequency is below <see cref="MinDocFreq"/>, above
+ /// <see cref="MaxDocFreq"/>, or zero. Remaining words are scored as tf * idf.
+ /// </summary>
+ /// <param name="words"> a map of words keyed on the word(String) with Int objects as the values. </param>
+ /// <returns> queue of entries { word, top field, score, idf, docFreq, tf } </returns>
+ private PriorityQueue<object[]> createQueue(IDictionary<string, Int> words)
+ {
+     int numDocs = ir.NumDocs;
+     FreqQ queue = new FreqQ(words.Count); // will order words by score
+
+     foreach (KeyValuePair<string, Int> entry in words)
+     {
+         string word = entry.Key;
+         int tf = entry.Value.x; // term freq in the source doc
+         if (MinTermFreq > 0 && tf < MinTermFreq)
+         {
+             continue; // not frequent enough in the source
+         }
+
+         // pick the field in which this word has the largest document frequency
+         string topField = FieldNames[0];
+         int docFreq = 0;
+         foreach (string fieldName in FieldNames)
+         {
+             int freq = ir.DocFreq(new Term(fieldName, word));
+             if (freq > docFreq)
+             {
+                 topField = fieldName;
+                 docFreq = freq;
+             }
+         }
+
+         bool tooRare = MinDocFreq > 0 && docFreq < MinDocFreq;
+         bool tooCommon = docFreq > MaxDocFreq;
+         bool missing = docFreq == 0; // index update problem?
+         if (tooRare || tooCommon || missing)
+         {
+             continue;
+         }
+
+         float idf = Similarity.Idf(docFreq, numDocs);
+         float score = tf * idf;
+
+         // only really need 1st 3 entries, other ones are for troubleshooting
+         object[] record = new object[] { word, topField, score, idf, docFreq, tf };
+         queue.InsertWithOverflow(record);
+     }
+     return queue;
+ }
+
+ /// <summary>
+ /// Describe the parameters that control how the "more like this" query is formed.
+ /// </summary>
+ /// <returns> one tab-indented, newline-terminated line per reported parameter </returns>
+ public string describeParams()
+ {
+     const string indent = "\t";
+     const string eol = "\n";
+     StringBuilder text = new StringBuilder();
+
+     text.Append(indent).Append("maxQueryTerms : ").Append(MaxQueryTerms).Append(eol);
+     text.Append(indent).Append("minWordLen : ").Append(MinWordLen).Append(eol);
+     text.Append(indent).Append("maxWordLen : ").Append(MaxWordLen).Append(eol);
+
+     // comma-separated field list
+     text.Append(indent).Append("fieldNames : ");
+     string[] names = FieldNames;
+     for (int i = 0; i < names.Length; i++)
+     {
+         if (i > 0)
+         {
+             text.Append(", ");
+         }
+         text.Append(names[i]);
+     }
+     text.Append(eol);
+
+     text.Append(indent).Append("boost : ").Append(Boost).Append(eol);
+     text.Append(indent).Append("minTermFreq : ").Append(MinTermFreq).Append(eol);
+     text.Append(indent).Append("minDocFreq : ").Append(MinDocFreq).Append(eol);
+     return text.ToString();
+ }
+
+ /// <summary>
+ /// Find words for a more-like-this query former.
+ /// For each name in <see cref="FieldNames"/> the document's term vector is used when present;
+ /// otherwise the field's stored string values are re-tokenized.
+ /// </summary>
+ /// <param name="docNum"> the id of the lucene document from which to find terms </param>
+ /// <returns> the queue built by createQueue from the collected term frequencies </returns>
+ public PriorityQueue<object[]> RetrieveTerms(int docNum)
+ {
+     IDictionary<string, Int> termFreqMap = new Dictionary<string, Int>();
+
+     // Both lookups depend only on docNum, so do them once instead of once per field.
+     Fields vectors = ir.GetTermVectors(docNum);
+     Document storedDoc = null; // loaded on first use; only needed for fields without a term vector
+
+     foreach (string fieldName in FieldNames)
+     {
+         Terms vector = (vectors != null) ? vectors.Terms(fieldName) : null;
+         if (vector != null)
+         {
+             AddTermFrequencies(termFreqMap, vector);
+             continue;
+         }
+
+         // field does not store term vector info
+         if (storedDoc == null)
+         {
+             storedDoc = ir.Document(docNum);
+         }
+         IndexableField[] fields = storedDoc.GetFields(fieldName);
+         foreach (IndexableField field in fields)
+         {
+             string stringValue = field.StringValue;
+             if (stringValue != null)
+             {
+                 AddTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
+             }
+         }
+     }
+
+     return createQueue(termFreqMap);
+ }
+
+ /// <summary>
+ /// Adds terms and frequencies found in vector into the Map termFreqMap.
+ /// Terms rejected by IsNoiseWord are skipped.
+ /// </summary>
+ /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
+ /// <param name="vector"> List of terms and their frequencies for a doc/field </param>
+ private void AddTermFrequencies(IDictionary<string, Int> termFreqMap, Terms vector)
+ {
+     TermsEnum termsEnum = vector.Iterator(null);
+     CharsRef spare = new CharsRef();
+     BytesRef text;
+     while ((text = termsEnum.Next()) != null)
+     {
+         UnicodeUtil.UTF8toUTF16(text, spare);
+         string term = spare.ToString();
+         if (IsNoiseWord(term))
+         {
+             continue;
+         }
+         int freq = (int)termsEnum.TotalTermFreq();
+
+         // increment frequency
+         // TryGetValue, not the indexer: the .NET dictionary indexer throws
+         // KeyNotFoundException for a term that has not been seen yet.
+         Int cnt;
+         if (termFreqMap.TryGetValue(term, out cnt) && cnt != null)
+         {
+             cnt.x += freq;
+         }
+         else
+         {
+             cnt = new Int();
+             cnt.x = freq;
+             termFreqMap[term] = cnt;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Adds term frequencies found by tokenizing text from reader into the Map words.
+ /// At most <see cref="MaxNumTokensParsed"/> tokens are read; terms rejected by IsNoiseWord are skipped.
+ /// </summary>
+ /// <param name="r"> a source of text to be tokenized </param>
+ /// <param name="termFreqMap"> a Map of terms and their frequencies </param>
+ /// <param name="fieldName"> Used by analyzer for any special per-field analysis </param>
+ /// <exception cref="System.NotSupportedException"> when no <see cref="Analyzer"/> has been set </exception>
+ private void AddTermFrequencies(Reader r, IDictionary<string, Int> termFreqMap, string fieldName)
+ {
+     if (Analyzer == null)
+     {
+         throw new System.NotSupportedException("To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
+     }
+     TokenStream ts = Analyzer.TokenStream(fieldName, r);
+     try
+     {
+         int tokenCount = 0;
+         // for every token
+         var termAtt = ts.AddAttribute<CharTermAttribute>();
+         ts.Reset();
+         while (ts.IncrementToken())
+         {
+             string word = termAtt.ToString();
+             tokenCount++;
+             if (tokenCount > MaxNumTokensParsed)
+             {
+                 break;
+             }
+             if (IsNoiseWord(word))
+             {
+                 continue;
+             }
+
+             // increment frequency
+             // TryGetValue, not the indexer: the .NET dictionary indexer throws
+             // KeyNotFoundException for a word that has not been seen yet.
+             Int cnt;
+             if (termFreqMap.TryGetValue(word, out cnt) && cnt != null)
+             {
+                 cnt.x++;
+             }
+             else
+             {
+                 termFreqMap[word] = new Int(); // a new Int already counts one occurrence
+             }
+         }
+         ts.End();
+     }
+     finally
+     {
+         IOUtils.CloseWhileHandlingException(ts);
+     }
+ }
+
+
+ /// <summary>
+ /// Decides whether a term should be left out of "more like" comparisons: it is shorter than
+ /// <see cref="MinWordLen"/>, longer than <see cref="MaxWordLen"/> (each limit only applies when
+ /// greater than 0), or contained in <see cref="StopWords"/>.
+ /// </summary>
+ /// <param name="term"> The word being considered </param>
+ /// <returns> true if should be ignored, false if should be used in further analysis </returns>
+ private bool IsNoiseWord(string term)
+ {
+     int length = term.Length;
+     bool tooShort = MinWordLen > 0 && length < MinWordLen;
+     bool tooLong = MaxWordLen > 0 && length > MaxWordLen;
+     if (tooShort || tooLong)
+     {
+         return true;
+     }
+
+     ISet<string> stops = StopWords;
+     return stops != null && stops.Contains(term);
+ }
+
+
+ /// <summary>
+ /// Find words for a more-like-this query former.
+ /// The text from <paramref name="r"/> is tokenized into a word->frequency map, which createQueue
+ /// turns into a priority queue of arrays. Each array has 6 elements:
+ /// <ol>
+ /// <li> The word (string) </li>
+ /// <li> The top field that this word comes from (string) </li>
+ /// <li> The score for this word (float) </li>
+ /// <li> The IDF value (float) </li>
+ /// <li> The frequency of this word in the index (int) </li>
+ /// <li> The frequency of this word in the source document (int) </li>
+ /// </ol>
+ /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+ /// This method is exposed so that you can identify the "interesting words" in a document.
+ /// For an easier method to call see RetrieveInterestingTerms.
+ /// </summary>
+ /// <param name="r"> the reader that has the content of the document </param>
+ /// <param name="fieldName"> field passed to the analyzer to use when analyzing the content </param>
+ /// <returns> the most interesting words in the document ordered by score, with the highest scoring, or best entry, first </returns>
+ public PriorityQueue<object[]> RetrieveTerms(Reader r, string fieldName)
+ {
+     var termFrequencies = new Dictionary<string, Int>();
+     AddTermFrequencies(r, termFrequencies, fieldName);
+     return createQueue(termFrequencies);
+ }
+
+ /// <summary>
+ /// Returns the most interesting words of an indexed document, at most
+ /// <see cref="MaxQueryTerms"/> of them, in the order the term queue yields them.
+ /// </summary>
+ /// <param name="docNum"> the id of the lucene document from which to find terms </param>
+ /// <returns> the selected words; empty when <see cref="MaxQueryTerms"/> is 0 or less </returns>
+ public string[] RetrieveInterestingTerms(int docNum)
+ {
+     // have to be careful, RetrieveTerms returns all words but that's probably not useful
+     // to our caller... we just want to return the top words
+     int lim = MaxQueryTerms;
+     // A negative capacity would make the List constructor throw, so clamp the size hint.
+     List<string> words = new List<string>(lim > 0 ? lim : 0);
+     PriorityQueue<object[]> pq = RetrieveTerms(docNum);
+
+     object[] ar;
+     while (lim-- > 0 && (ar = pq.Pop()) != null)
+     {
+         words.Add((string)ar[0]); // the 1st entry is the interesting word
+     }
+
+     // List<T>.ToArray() takes no argument; a typed list yields string[] directly.
+     return words.ToArray();
+ }
+
+ /// <summary>
+ /// Convenience routine to make it easy to return the most interesting words in a document.
+ /// More advanced users will call RetrieveTerms(Reader, string) directly.
+ /// At most <see cref="MaxQueryTerms"/> words are returned.
+ /// </summary>
+ /// <param name="r"> the source document </param>
+ /// <param name="fieldName"> field passed to analyzer to use when analyzing the content </param>
+ /// <returns> the most interesting words in the document; empty when <see cref="MaxQueryTerms"/> is 0 or less </returns>
+ public string[] RetrieveInterestingTerms(Reader r, string fieldName)
+ {
+     // have to be careful, RetrieveTerms returns all words but that's probably not useful
+     // to our caller... we just want to return the top words
+     int lim = MaxQueryTerms;
+     // A negative capacity would make the List constructor throw, so clamp the size hint.
+     List<string> words = new List<string>(lim > 0 ? lim : 0);
+     PriorityQueue<object[]> pq = RetrieveTerms(r, fieldName);
+
+     object[] ar;
+     while (lim-- > 0 && (ar = pq.Pop()) != null)
+     {
+         words.Add((string)ar[0]); // the 1st entry is the interesting word
+     }
+
+     // List<T>.ToArray() takes no argument; a typed list yields string[] directly.
+     return words.ToArray();
+ }
+
+ /// <summary>
+ /// PriorityQueue that orders words by score, read from element [2] of each entry.
+ /// </summary>
+ private class FreqQ : PriorityQueue<object[]>
+ {
+     internal FreqQ(int s)
+         : base(s)
+     {
+     }
+
+     /// <summary>
+     /// Reports <paramref name="aa"/> as "less than" <paramref name="bb"/> when its score is the
+     /// HIGHER one. NOTE(review): presumably this inversion is what makes Pop() hand out the
+     /// best-scoring entry first - confirm against the PriorityQueue base class.
+     /// </summary>
+     public override bool LessThan(object[] aa, object[] bb)
+     {
+         float? left = (float?)aa[2];
+         float? right = (float?)bb[2];
+         return right < left;
+     }
+ }
+
+ /// <summary>
+ /// Mutable counter used for term frequencies, so a map entry can be incremented in place.
+ /// A new instance starts at 1.
+ /// </summary>
+ private class Int
+ {
+     internal int x = 1;
+
+     internal Int()
+     {
+     }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ba0f3c7d/src/Lucene.Net.Queries/Mlt/MoreLikeThisQuery.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/Mlt/MoreLikeThisQuery.cs b/src/Lucene.Net.Queries/Mlt/MoreLikeThisQuery.cs
index d397720..c9dcbe9 100644
--- a/src/Lucene.Net.Queries/Mlt/MoreLikeThisQuery.cs
+++ b/src/Lucene.Net.Queries/Mlt/MoreLikeThisQuery.cs
@@ -1,8 +1,9 @@
using System.Collections.Generic;
-
/*
* Created on 25-Jan-2006
*/
+using Lucene.Net.Queries.Mlt;
+
namespace org.apache.lucene.queries.mlt
{
@@ -206,7 +207,7 @@ namespace org.apache.lucene.queries.mlt
result = prime * result + minDocFreq;
result = prime * result + minTermFrequency;
result = prime * result + Arrays.GetHashCode(moreLikeFields);
- result = prime * result + float.floatToIntBits(percentTermsToMatch);
+ result = prime * result + Number.FloatToIntBits(percentTermsToMatch);
result = prime * result + ((stopWords == null) ? 0 : stopWords.GetHashCode());
return result;
}
@@ -275,7 +276,7 @@ namespace org.apache.lucene.queries.mlt
{
return false;
}
- if (float.floatToIntBits(percentTermsToMatch) != float.floatToIntBits(other.percentTermsToMatch))
+ if (Number.FloatToIntBits(percentTermsToMatch) != Number.FloatToIntBits(other.percentTermsToMatch))
{
return false;
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ba0f3c7d/src/Lucene.Net.Queries/TermFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/TermFilter.cs b/src/Lucene.Net.Queries/TermFilter.cs
index 1e4ffd0..d2580c7 100644
--- a/src/Lucene.Net.Queries/TermFilter.cs
+++ b/src/Lucene.Net.Queries/TermFilter.cs
@@ -1,139 +1,122 @@
-namespace org.apache.lucene.queries
-{
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- using AtomicReaderContext = org.apache.lucene.index.AtomicReaderContext;
- using DocsEnum = org.apache.lucene.index.DocsEnum;
- using Term = org.apache.lucene.index.Term;
- using Terms = org.apache.lucene.index.Terms;
- using TermsEnum = org.apache.lucene.index.TermsEnum;
- using DocIdSet = org.apache.lucene.search.DocIdSet;
- using DocIdSetIterator = org.apache.lucene.search.DocIdSetIterator;
- using Filter = org.apache.lucene.search.Filter;
- using Bits = org.apache.lucene.util.Bits;
-
- /// <summary>
- /// A filter that includes documents that match with a specific term.
- /// </summary>
- public sealed class TermFilter : Filter
- {
-
- private readonly Term term;
-
- /// <param name="term"> The term documents need to have in order to be a match for this filter. </param>
- public TermFilter(Term term)
- {
- if (term == null)
- {
- throw new System.ArgumentException("Term must not be null");
- }
- else if (term.field() == null)
- {
- throw new System.ArgumentException("Field must not be null");
- }
- this.term = term;
- }
-
- /// <returns> The term this filter includes documents with. </returns>
- public Term Term
- {
- get
- {
- return term;
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.search.DocIdSet getDocIdSet(org.apache.lucene.index.AtomicReaderContext context, final org.apache.lucene.util.Bits acceptDocs) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- public override DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
- {
- Terms terms = context.reader().terms(term.field());
- if (terms == null)
- {
- return null;
- }
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.index.TermsEnum termsEnum = terms.iterator(null);
- TermsEnum termsEnum = terms.iterator(null);
- if (!termsEnum.seekExact(term.bytes()))
- {
- return null;
- }
- return new DocIdSetAnonymousInnerClassHelper(this, acceptDocs, termsEnum);
- }
-
- private class DocIdSetAnonymousInnerClassHelper : DocIdSet
- {
- private readonly TermFilter outerInstance;
-
- private Bits acceptDocs;
- private TermsEnum termsEnum;
-
- public DocIdSetAnonymousInnerClassHelper(TermFilter outerInstance, Bits acceptDocs, TermsEnum termsEnum)
- {
- this.outerInstance = outerInstance;
- this.acceptDocs = acceptDocs;
- this.termsEnum = termsEnum;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.search.DocIdSetIterator iterator() throws java.io.IOException
- public override DocIdSetIterator iterator()
- {
- return termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE);
- }
-
- }
-
- public override bool Equals(object o)
- {
- if (this == o)
- {
- return true;
- }
- if (o == null || this.GetType() != o.GetType())
- {
- return false;
- }
-
- TermFilter that = (TermFilter) o;
-
- if (term != null ?!term.Equals(that.term) : that.term != null)
- {
- return false;
- }
-
- return true;
- }
-
- public override int GetHashCode()
- {
- return term != null ? term.GetHashCode() : 0;
- }
-
- public override string ToString()
- {
- return term.field() + ":" + term.text();
- }
-
- }
+namespace Lucene.Net.Queries
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// A <see cref="Filter"/> that includes only documents that match one specific <see cref="Term"/>. Two instances are equal (and hash alike) when their terms are equal.
+ /// </summary>
+ public sealed class TermFilter : Filter
+ {
+
+ private readonly Term term; // set once in the constructor, which rejects a null term
+
+ /// <param name="term"> The term documents need to have in order to be a match for this filter. Must not be null and must have a non-null field; otherwise a System.ArgumentException is thrown. </param>
+ public TermFilter(Term term)
+ {
+ if (term == null)
+ {
+ throw new System.ArgumentException("Term must not be null");
+ }
+ else if (term.Field() == null)
+ {
+ throw new System.ArgumentException("Field must not be null");
+ }
+ this.term = term;
+ }
+
+ /// <returns> The term this filter includes documents with (the same instance that was passed to the constructor). </returns>
+ public Term Term
+ {
+ get
+ {
+ return term;
+ }
+ }
+
+ public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+ {
+ Terms terms = context.AtomicReader.Terms(term.Field()); // look up the terms of this filter's field in the given reader
+ if (terms == null)
+ {
+ return null; // no Terms for the field in this reader; NOTE(review): null presumably means "no documents match" to callers -- confirm against the Filter contract
+ }
+
+ TermsEnum termsEnum = terms.Iterator(null); // null: no existing TermsEnum is passed in
+ if (!termsEnum.SeekExact(term.Bytes()))
+ {
+ return null; // SeekExact returned false -- presumably the term is absent from this reader
+ }
+ return new DocIdSetAnonymousInnerClassHelper(this, acceptDocs, termsEnum); // termsEnum is handed over after the seek above
+ }
+
+ private class DocIdSetAnonymousInnerClassHelper : DocIdSet
+ {
+ private readonly TermFilter outerInstance; // assigned in the constructor but not read anywhere in this class
+
+ private Bits acceptDocs;
+ private TermsEnum termsEnum;
+
+ public DocIdSetAnonymousInnerClassHelper(TermFilter outerInstance, Bits acceptDocs, TermsEnum termsEnum)
+ {
+ this.outerInstance = outerInstance;
+ this.acceptDocs = acceptDocs;
+ this.termsEnum = termsEnum;
+ }
+
+ public override DocIdSetIterator GetIterator()
+ {
+ return termsEnum.Docs(acceptDocs, null, DocsEnum.FLAG_NONE); // passes the stored acceptDocs, no DocsEnum to reuse (null), and FLAG_NONE
+ }
+ }
+
+ public override bool Equals(object o)
+ {
+ if (this == o) // o is typed as object, so this is a reference comparison
+ {
+ return true;
+ }
+ if (o == null || this.GetType() != o.GetType())
+ {
+ return false;
+ }
+
+ TermFilter that = (TermFilter)o; // safe: the runtime types were just checked to be identical
+
+ if (term != null ? !term.Equals(that.term) : that.term != null)
+ {
+ return false; // terms differ (the comparison above is null-safe on both sides)
+ }
+
+ return true;
+ }
+
+ public override int GetHashCode()
+ {
+ return term != null ? term.GetHashCode() : 0; // derived from the same field Equals compares
+ }
+
+ public override string ToString()
+ {
+ return term.Field() + ":" + term.Text(); // rendered as "field:text"
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ba0f3c7d/src/Lucene.Net.Queries/TermsFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs
index be418fb..667cd55 100644
--- a/src/Lucene.Net.Queries/TermsFilter.cs
+++ b/src/Lucene.Net.Queries/TermsFilter.cs
@@ -1,8 +1,13 @@
using System;
using System.Collections.Generic;
+using System.Linq;
using System.Text;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
-namespace org.apache.lucene.queries
+namespace Lucene.Net.Queries
{
/*
@@ -21,18 +26,7 @@ namespace org.apache.lucene.queries
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
- using org.apache.lucene.index;
- using DocIdSet = org.apache.lucene.search.DocIdSet;
- using DocIdSetIterator = org.apache.lucene.search.DocIdSetIterator;
- using Filter = org.apache.lucene.search.Filter;
- using ArrayUtil = org.apache.lucene.util.ArrayUtil;
- using Bits = org.apache.lucene.util.Bits;
- using BytesRef = org.apache.lucene.util.BytesRef;
- using FixedBitSet = org.apache.lucene.util.FixedBitSet;
-
-
- /// <summary>
+ /// <summary>
/// Constructs a filter for docs matching any of the terms added to this class.
/// Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
/// a sequence. An example might be a collection of primary keys from a database query result or perhaps
@@ -62,8 +56,6 @@ namespace org.apache.lucene.queries
/// Creates a new <seealso cref="TermsFilter"/> from the given list. The list
/// can contain duplicate terms and multiple fields.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: public TermsFilter(final java.util.List<Term> terms)
public TermsFilter(IList<Term> terms) : this(new FieldAndTermEnumAnonymousInnerClassHelper(this, terms), terms.Count)
{
}
@@ -82,10 +74,10 @@ namespace org.apache.lucene.queries
}
// we need to sort for deduplication and to have a common cache key
- internal readonly IEnumerator<Term> iter;
- public override BytesRef next()
+ readonly IEnumerator<Term> iter;
+ public override BytesRef Next()
{
- if (iter.hasNext())
+ if (iter.HasNext())
{
Term next = iter.next();
field = next.field();
@@ -99,8 +91,6 @@ namespace org.apache.lucene.queries
/// Creates a new <seealso cref="TermsFilter"/> from the given <seealso cref="BytesRef"/> list for
/// a single field.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: public TermsFilter(final String field, final java.util.List<org.apache.lucene.util.BytesRef> terms)
public TermsFilter(string field, IList<BytesRef> terms) : this(new FieldAndTermEnumAnonymousInnerClassHelper2(this, field, terms), terms.Count)
{
}
@@ -115,16 +105,16 @@ namespace org.apache.lucene.queries
{
this.outerInstance = outerInstance;
this.terms = terms;
- iter = sort(terms).GetEnumerator();
+ iter = Sort(terms).GetEnumerator();
}
// we need to sort for deduplication and to have a common cache key
- internal readonly IEnumerator<BytesRef> iter;
- public override BytesRef next()
+ readonly IEnumerator<BytesRef> iter;
+ public override BytesRef Next()
{
- if (iter.hasNext())
+ if (iter.HasNext())
{
- return iter.next();
+ return iter.Next();
}
return null;
}
@@ -134,9 +124,7 @@ namespace org.apache.lucene.queries
/// Creates a new <seealso cref="TermsFilter"/> from the given <seealso cref="BytesRef"/> array for
/// a single field.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: public TermsFilter(final String field, final org.apache.lucene.util.BytesRef...terms)
- public TermsFilter(string field, params BytesRef[] terms) : this(field, Arrays.asList(terms))
+ public TermsFilter(string field, params BytesRef[] terms) : this(field, Arrays.AsList(terms))
{
// this ctor prevents unnecessary Term creations
}
@@ -145,9 +133,7 @@ namespace org.apache.lucene.queries
/// Creates a new <seealso cref="TermsFilter"/> from the given array. The array can
/// contain duplicate terms and multiple fields.
/// </summary>
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
-//ORIGINAL LINE: public TermsFilter(final Term... terms)
- public TermsFilter(params Term[] terms) : this(Arrays.asList(terms))
+ public TermsFilter(params Term[] terms) : this(terms.ToList())
{
}
@@ -175,7 +161,7 @@ namespace org.apache.lucene.queries
string previousField = null;
BytesRef currentTerm;
string currentField;
- while ((currentTerm = iter.next()) != null)
+ while ((currentTerm = iter.Next()) != null)
{
currentField = iter.field();
if (currentField == null)
@@ -187,15 +173,13 @@ namespace org.apache.lucene.queries
// deduplicate
if (previousField.Equals(currentField))
{
- if (previousTerm.bytesEquals(currentTerm))
+ if (previousTerm.BytesEquals(currentTerm))
{
continue;
}
}
else
{
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.Add(lastTermsAndField);
@@ -215,31 +199,20 @@ namespace org.apache.lucene.queries
previousField = currentField;
}
offsets[index] = lastEndOffset;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.Add(lastTermsAndField);
- this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
+ this.termsBytes = ArrayUtil.Shrink(serializedTerms, lastEndOffset);
this.termsAndFields = termsAndFields.ToArray();
this.hashCode_Renamed = hash;
}
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.search.DocIdSet getDocIdSet(AtomicReaderContext context, org.apache.lucene.util.Bits acceptDocs) throws java.io.IOException
- public override DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+ public override DocIdSet GetDocIdSet(AtomicReaderContext context, Bits acceptDocs)
{
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final AtomicReader reader = context.reader();
- AtomicReader reader = context.reader();
+ AtomicReader reader = context.AtomicReader;
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final Fields fields = reader.fields();
- Fields fields = reader.fields();
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final org.apache.lucene.util.BytesRef spare = new org.apache.lucene.util.BytesRef(this.termsBytes);
+ Fields fields = reader.Fields;
BytesRef spare = new BytesRef(this.termsBytes);
if (fields == null)
{
@@ -250,7 +223,7 @@ namespace org.apache.lucene.queries
DocsEnum docs = null;
foreach (TermsAndField termsAndField in this.termsAndFields)
{
- if ((terms = fields.terms(termsAndField.field)) != null)
+ if ((terms = fields.Terms(termsAndField.field)) != null)
{
termsEnum = terms.iterator(termsEnum); // this won't return null
for (int i = termsAndField.start; i < termsAndField.end; i++)
@@ -321,15 +294,15 @@ namespace org.apache.lucene.queries
TermsAndField current = termsAndFields[i];
for (int j = current.start; j < current.end; j++)
{
- spare.offset = offsets[j];
- spare.length = offsets[j + 1] - offsets[j];
+ spare.Offset = offsets[j];
+ spare.Length = offsets[j + 1] - offsets[j];
if (!first)
{
builder.Append(' ');
}
first = false;
builder.Append(current.field).Append(':');
- builder.Append(spare.utf8ToString());
+ builder.Append(spare.Utf8ToString());
}
}
@@ -401,9 +374,9 @@ namespace org.apache.lucene.queries
private abstract class FieldAndTermEnum
{
- protected internal string field_Renamed;
+ protected internal string field;
- public abstract BytesRef next();
+ public abstract BytesRef Next();
public FieldAndTermEnum()
{
@@ -411,13 +384,13 @@ namespace org.apache.lucene.queries
public FieldAndTermEnum(string field)
{
- this.field_Renamed = field;
+ this.field = field;
}
- public virtual string field()
- {
- return field_Renamed;
- }
+ public virtual string Field
+ {
+ get { return field; }
+ }
}
/*
@@ -425,7 +398,7 @@ namespace org.apache.lucene.queries
*/
//JAVA TO C# CONVERTER TODO TASK: Java wildcard generics are not converted to .NET:
//ORIGINAL LINE: private static <T extends Comparable<? base T>> java.util.List<T> sort(java.util.List<T> toSort)
- private static IList<T> sort<T>(IList<T> toSort) where T : Comparable<? base T>
+ private static IList<T> Sort<T>(IList<T> toSort) where T : Comparable<? base T>
{
if (toSort.Count == 0)
{