You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by yo...@apache.org on 2008/11/11 03:35:52 UTC
svn commit: r712922 [3/9] - in /lucene/java/trunk: contrib/analyzers/src/java/org/apache/lucene/analysis/el/ contrib/analyzers/src/java/org/apache/lucene/analysis/fr/ contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/ contrib/analyzer...

Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java Mon Nov 10 18:35:46 2008
@@ -1,248 +1,248 @@
-package org.apache.lucene.search;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.io.IOException;
-import java.util.BitSet;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.util.OpenBitSet;
-
-public class DuplicateFilter extends Filter
-{
-	
-	String fieldName;
-	
-	/**
-	 * KeepMode determines which document id to consider as the master, all others being 
-	 * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
-	 */
-	int keepMode=KM_USE_FIRST_OCCURRENCE;
-	public static final int KM_USE_FIRST_OCCURRENCE=1;
-	public static final int KM_USE_LAST_OCCURRENCE=2;
-	
-	/**
-	 * "Full" processing mode starts by setting all bits to false and only setting bits
-	 * for documents that contain the given field and are identified as none-duplicates. 
-
-	 * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
-	 * given field. This approach avoids the need to read TermDocs for terms that are seen 
-	 * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially 
-	 * faster approach , the downside is that bitsets produced will include bits set for 
-	 * documents that do not actually contain the field given.
-	 * 
-	 */
-	int processingMode=PM_FULL_VALIDATION;
-	public static final int PM_FULL_VALIDATION=1;
-	public static final int PM_FAST_INVALIDATION=2;
-	
-
-	
-	public DuplicateFilter(String fieldName)
-	{
-		this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
-	}
-	
-
-	public DuplicateFilter(String fieldName, int keepMode, int processingMode)
-	{
-		this.fieldName = fieldName;
-		this.keepMode = keepMode;
-		this.processingMode = processingMode;
-	}
-
-  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
-	{
-		if(processingMode==PM_FAST_INVALIDATION)
-		{
-			return fastBits(reader);
-		}
-		else
-		{
-			return correctBits(reader);
-		}
-	}
-	
-  private OpenBitSet correctBits(IndexReader reader) throws IOException
-	{
-		
-    OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid
-		Term startTerm=new Term(fieldName);
-		TermEnum te = reader.terms(startTerm);
-		if(te!=null)
-		{
-			Term currTerm=te.term();
-			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
-			{
-				int lastDoc=-1;
-				//set non duplicates
-				TermDocs td = reader.termDocs(currTerm);
-				if(td.next())
-				{
-					if(keepMode==KM_USE_FIRST_OCCURRENCE)
-					{
-						bits.set(td.doc());
-					}
-					else
-					{
-						do
-						{
-							lastDoc=td.doc();
-						}while(td.next());
-						bits.set(lastDoc);
-					}
-				}
-				if(!te.next())
-				{
-					break;
-				}
-				currTerm=te.term();
-			}
-		}
-		return bits;
-	}
-	
-  private OpenBitSet fastBits(IndexReader reader) throws IOException
-	{
-		
-    OpenBitSet bits=new OpenBitSet(reader.maxDoc());
-		bits.set(0,reader.maxDoc()); //assume all are valid
-		Term startTerm=new Term(fieldName);
-		TermEnum te = reader.terms(startTerm);
-		if(te!=null)
-		{
-			Term currTerm=te.term();
-			
-			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
-			{
-				if(te.docFreq()>1)
-				{
-					int lastDoc=-1;
-					//unset potential duplicates
-					TermDocs td = reader.termDocs(currTerm);
-					td.next();
-					if(keepMode==KM_USE_FIRST_OCCURRENCE)
-					{
-						td.next();
-					}
-					do
-					{
-						lastDoc=td.doc();
-            bits.clear(lastDoc);
-					}while(td.next());
-					if(keepMode==KM_USE_LAST_OCCURRENCE)
-					{
-						//restore the last bit
-						bits.set(lastDoc);
-					}					
-				}
-				if(!te.next())
-				{
-					break;
-				}
-				currTerm=te.term();
-			}
-		}
-		return bits;
-	}
-
-	/**
-	 * @param args
-	 * @throws IOException 
-	 * @throws Exception 
-	 */
-	public static void main(String[] args) throws Exception
-	{
-		IndexReader r=IndexReader.open("/indexes/personCentricAnon");
-//		IndexReader r=IndexReader.open("/indexes/enron");
-		long start=System.currentTimeMillis();
-//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
-//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
-		DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
-//		DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
-//		df.setProcessingMode(PM_SLOW_VALIDATION);
-		BitSet b = df.bits(r);
-		long end=System.currentTimeMillis()-start;
-		System.out.println(b.cardinality()+" in "+end+" ms ");
-
-	}
-
-
-	public String getFieldName()
-	{
-		return fieldName;
-	}
-
-
-	public void setFieldName(String fieldName)
-	{
-		this.fieldName = fieldName;
-	}
-
-
-	public int getKeepMode()
-	{
-		return keepMode;
-	}
-
-
-	public void setKeepMode(int keepMode)
-	{
-		this.keepMode = keepMode;
-	}
-
-
-	public boolean equals(Object obj)
-	{
-		if(this == obj)
-			return true;
-		if((obj == null) || (obj.getClass() != this.getClass()))
-			return false;
-		DuplicateFilter other = (DuplicateFilter)obj;
-		return keepMode == other.keepMode &&
-		processingMode == other.processingMode &&
-			(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
-	}
-
-
-
-	public int hashCode()
-	{
-		int hash = 217;
-		hash = 31 * hash + keepMode;
-		hash = 31 * hash + processingMode;
-		hash = 31 * hash + fieldName.hashCode();
-		return hash;	
-	}
-
-
-	public int getProcessingMode()
-	{
-		return processingMode;
-	}
-
-
-	public void setProcessingMode(int processingMode)
-	{
-		this.processingMode = processingMode;
-	}
-	
-	
-
-}
+package org.apache.lucene.search;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.OpenBitSet;
+
+public class DuplicateFilter extends Filter
+{
+	
+	String fieldName;
+	
+	/**
+	 * KeepMode determines which document id to consider as the master, all others being 
+	 * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+	 */
+	int keepMode=KM_USE_FIRST_OCCURRENCE;
+	public static final int KM_USE_FIRST_OCCURRENCE=1;
+	public static final int KM_USE_LAST_OCCURRENCE=2;
+	
+	/**
+	 * "Full" processing mode starts by setting all bits to false and only setting bits
+	 * for documents that contain the given field and are identified as none-duplicates. 
+
+	 * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+	 * given field. This approach avoids the need to read TermDocs for terms that are seen 
+	 * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially 
+	 * faster approach , the downside is that bitsets produced will include bits set for 
+	 * documents that do not actually contain the field given.
+	 * 
+	 */
+	int processingMode=PM_FULL_VALIDATION;
+	public static final int PM_FULL_VALIDATION=1;
+	public static final int PM_FAST_INVALIDATION=2;
+	
+
+	
+	public DuplicateFilter(String fieldName)
+	{
+		this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
+	}
+	
+
+	public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+	{
+		this.fieldName = fieldName;
+		this.keepMode = keepMode;
+		this.processingMode = processingMode;
+	}
+
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
+	{
+		if(processingMode==PM_FAST_INVALIDATION)
+		{
+			return fastBits(reader);
+		}
+		else
+		{
+			return correctBits(reader);
+		}
+	}
+	
+  private OpenBitSet correctBits(IndexReader reader) throws IOException
+	{
+		
+    OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid
+		Term startTerm=new Term(fieldName);
+		TermEnum te = reader.terms(startTerm);
+		if(te!=null)
+		{
+			Term currTerm=te.term();
+			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+			{
+				int lastDoc=-1;
+				//set non duplicates
+				TermDocs td = reader.termDocs(currTerm);
+				if(td.next())
+				{
+					if(keepMode==KM_USE_FIRST_OCCURRENCE)
+					{
+						bits.set(td.doc());
+					}
+					else
+					{
+						do
+						{
+							lastDoc=td.doc();
+						}while(td.next());
+						bits.set(lastDoc);
+					}
+				}
+				if(!te.next())
+				{
+					break;
+				}
+				currTerm=te.term();
+			}
+		}
+		return bits;
+	}
+	
+  private OpenBitSet fastBits(IndexReader reader) throws IOException
+	{
+		
+    OpenBitSet bits=new OpenBitSet(reader.maxDoc());
+		bits.set(0,reader.maxDoc()); //assume all are valid
+		Term startTerm=new Term(fieldName);
+		TermEnum te = reader.terms(startTerm);
+		if(te!=null)
+		{
+			Term currTerm=te.term();
+			
+			while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+			{
+				if(te.docFreq()>1)
+				{
+					int lastDoc=-1;
+					//unset potential duplicates
+					TermDocs td = reader.termDocs(currTerm);
+					td.next();
+					if(keepMode==KM_USE_FIRST_OCCURRENCE)
+					{
+						td.next();
+					}
+					do
+					{
+						lastDoc=td.doc();
+            bits.clear(lastDoc);
+					}while(td.next());
+					if(keepMode==KM_USE_LAST_OCCURRENCE)
+					{
+						//restore the last bit
+						bits.set(lastDoc);
+					}					
+				}
+				if(!te.next())
+				{
+					break;
+				}
+				currTerm=te.term();
+			}
+		}
+		return bits;
+	}
+
+	/**
+	 * @param args
+	 * @throws IOException 
+	 * @throws Exception 
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		IndexReader r=IndexReader.open("/indexes/personCentricAnon");
+//		IndexReader r=IndexReader.open("/indexes/enron");
+		long start=System.currentTimeMillis();
+//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
+//		DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+		DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+//		DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
+//		df.setProcessingMode(PM_SLOW_VALIDATION);
+		BitSet b = df.bits(r);
+		long end=System.currentTimeMillis()-start;
+		System.out.println(b.cardinality()+" in "+end+" ms ");
+
+	}
+
+
+	public String getFieldName()
+	{
+		return fieldName;
+	}
+
+
+	public void setFieldName(String fieldName)
+	{
+		this.fieldName = fieldName;
+	}
+
+
+	public int getKeepMode()
+	{
+		return keepMode;
+	}
+
+
+	public void setKeepMode(int keepMode)
+	{
+		this.keepMode = keepMode;
+	}
+
+
+	public boolean equals(Object obj)
+	{
+		if(this == obj)
+			return true;
+		if((obj == null) || (obj.getClass() != this.getClass()))
+			return false;
+		DuplicateFilter other = (DuplicateFilter)obj;
+		return keepMode == other.keepMode &&
+		processingMode == other.processingMode &&
+			(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
+	}
+
+
+
+	public int hashCode()
+	{
+		int hash = 217;
+		hash = 31 * hash + keepMode;
+		hash = 31 * hash + processingMode;
+		hash = 31 * hash + fieldName.hashCode();
+		return hash;	
+	}
+
+
+	public int getProcessingMode()
+	{
+		return processingMode;
+	}
+
+
+	public void setProcessingMode(int processingMode)
+	{
+		this.processingMode = processingMode;
+	}
+	
+	
+
+}

Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java Mon Nov 10 18:35:46 2008
@@ -1,66 +1,66 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.search.BooleanClause.Occur;
-
-/**
- * A Filter that wrapped with an indication of how that filter
- * is used when composed with another filter.
- * (Follows the boolean logic in BooleanClause for composition 
- * of queries.)
- */
-
-public class FilterClause implements java.io.Serializable
-{
-	Occur occur = null;
-	Filter filter = null;
-
-	/**
-	 * Create a new FilterClause
-	 * @param filter A Filter object containing a BitSet
-	 * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
-	 */
-	
-	public FilterClause( Filter filter,Occur occur)
-	{
-		this.occur = occur;
-		this.filter = filter;
-	}
-
-	/**
-	 * Returns this FilterClause's filter
-	 * @return A Filter object
-	 */
-	
-	public Filter getFilter()
-	{
-		return filter;
-	}
-
-	/**
-	 * Returns this FilterClause's occur parameter
-	 * @return An Occur object
-	 */
-	
-	public Occur getOccur()
-	{
-		return occur;
-	}
-
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.BooleanClause.Occur;
+
+/**
+ * A Filter that wrapped with an indication of how that filter
+ * is used when composed with another filter.
+ * (Follows the boolean logic in BooleanClause for composition 
+ * of queries.)
+ */
+
+public class FilterClause implements java.io.Serializable
+{
+	Occur occur = null;
+	Filter filter = null;
+
+	/**
+	 * Create a new FilterClause
+	 * @param filter A Filter object containing a BitSet
+	 * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
+	 */
+	
+	public FilterClause( Filter filter,Occur occur)
+	{
+		this.occur = occur;
+		this.filter = filter;
+	}
+
+	/**
+	 * Returns this FilterClause's filter
+	 * @return A Filter object
+	 */
+	
+	public Filter getFilter()
+	{
+		return filter;
+	}
+
+	/**
+	 * Returns this FilterClause's occur parameter
+	 * @return An Occur object
+	 */
+	
+	public Occur getOccur()
+	{
+		return occur;
+	}
+
+}

Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java Mon Nov 10 18:35:46 2008
@@ -1,322 +1,322 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.util.PriorityQueue;
-
-/**
- * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
- * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
- * of fuzzy scoring factors.
- * This generally produces good results for queries where users may provide details in a number of 
- * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
- * a fast query.
- * 
- * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
- * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
- * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
- * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
- * term) and this is factored into the variant's boost. If the source query term does not exist in the
- * index the average IDF of the variants is used.
- */
-public class FuzzyLikeThisQuery extends Query
-{
-    static Similarity sim=new DefaultSimilarity();
-    Query rewrittenQuery=null;
-    ArrayList fieldVals=new ArrayList();
-    Analyzer analyzer;
-    
-    ScoreTermQueue q;
-    int MAX_VARIANTS_PER_TERM=50;
-    boolean ignoreTF=false;
-
-    
-    /**
-     * 
-     * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
-     * @param analyzer
-     */
-    public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
-    {
-        q=new ScoreTermQueue(maxNumTerms);
-        this.analyzer=analyzer;
-    }
-
-    class FieldVals
-    {
-    	String queryString;
-    	String fieldName;
-    	float minSimilarity;
-    	int prefixLength;
-		public FieldVals(String name, float similarity, int length, String queryString)
-		{
-			fieldName = name;
-			minSimilarity = similarity;
-			prefixLength = length;
-			this.queryString = queryString;
-		}
-    	
-    }
-    
-    /**
-     * Adds user input for "fuzzification" 
-     * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
-     * @param fieldName
-     * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
-     * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
-     */
-    public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
-    {
-    	fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
-    }
-    
-    
-    private void addTerms(IndexReader reader,FieldVals f) throws IOException
-    {
-        if(f.queryString==null) return;
-        TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
-        final Token reusableToken = new Token();
-        int corpusNumDocs=reader.numDocs();
-        Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
-        HashSet processedTerms=new HashSet();
-        for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
-        {
-                String term = nextToken.term();
-        	if(!processedTerms.contains(term))
-        	{
-        		processedTerms.add(term);
-                ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
-                float minScore=0;
-                Term startTerm=internSavingTemplateTerm.createTerm(term);
-                FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
-                TermEnum origEnum = reader.terms(startTerm);
-                int df=0;
-                if(startTerm.equals(origEnum.term()))
-                {
-                    df=origEnum.docFreq(); //store the df so all variants use same idf
-                }
-                int numVariants=0;
-                int totalVariantDocFreqs=0;
-                do
-                {
-                    Term possibleMatch=fe.term();
-                    if(possibleMatch!=null)
-                    {
-    	                numVariants++;
-    	                totalVariantDocFreqs+=fe.docFreq();
-    	                float score=fe.difference();
-    	                if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
-    	                    ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);                    
-    	                    variantsQ.insert(st);
-    	                    minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
-    	                }
-                    }
-                }
-                while(fe.next());
-                if(numVariants>0)
-                {
-	                int avgDf=totalVariantDocFreqs/numVariants;
-	                if(df==0)//no direct match we can use as df for all variants 
-	                {
-	                    df=avgDf; //use avg df of all variants
-	                }
-	                
-	                // take the top variants (scored by edit distance) and reset the score
-	                // to include an IDF factor then add to the global queue for ranking 
-	                // overall top query terms
-	                int size = variantsQ.size();
-	                for(int i = 0; i < size; i++)
-	                {
-	                  ScoreTerm st = (ScoreTerm) variantsQ.pop();
-	                  st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
-	                  q.insert(st);
-	                }                            
-                }
-        	}
-        }     
-    }
-            
-    public Query rewrite(IndexReader reader) throws IOException
-    {
-        if(rewrittenQuery!=null)
-        {
-            return rewrittenQuery;
-        }
-        //load up the list of possible terms
-        for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
-		{
-			FieldVals f = (FieldVals) iter.next();
-			addTerms(reader,f);			
-		}
-        //clear the list of fields
-        fieldVals.clear();
-        
-        BooleanQuery bq=new BooleanQuery();
-        
-        
-        //create BooleanQueries to hold the variants for each token/field pair and ensure it
-        // has no coord factor
-        //Step 1: sort the termqueries by term/field
-        HashMap variantQueries=new HashMap();
-        int size = q.size();
-        for(int i = 0; i < size; i++)
-        {
-          ScoreTerm st = (ScoreTerm) q.pop();
-          ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
-          if(l==null)
-          {
-              l=new ArrayList();
-              variantQueries.put(st.fuzziedSourceTerm,l);
-          }
-          l.add(st);
-        }
-        //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
-        for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
-        {
-            ArrayList variants = (ArrayList) iter.next();
-            if(variants.size()==1)
-            {
-                //optimize where only one selected variant
-                ScoreTerm st=(ScoreTerm) variants.get(0);
-                TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
-                tq.setBoost(st.score); // set the boost to a mix of IDF and score
-                bq.add(tq, BooleanClause.Occur.SHOULD); 
-            }
-            else
-            {
-                BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
-                for (Iterator iterator2 = variants.iterator(); iterator2
-                        .hasNext();)
-                {
-                    ScoreTerm st = (ScoreTerm) iterator2.next();
-                    TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);      // found a match
-                    tq.setBoost(st.score); // set the boost using the ScoreTerm's score
-                    termVariants.add(tq, BooleanClause.Occur.SHOULD);          // add to query                    
-                }
-                bq.add(termVariants, BooleanClause.Occur.SHOULD);          // add to query
-            }
-        }
-        //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
-        // booleans with a minimum-should-match of NumFields-1?
-        bq.setBoost(getBoost());
-        this.rewrittenQuery=bq;
-        return bq;
-    }
-    
-    //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
-    // term variants) then is reset with IDF for use in ranking against all other
-    // terms/fields
-    private static class ScoreTerm{
-        public Term term;
-        public float score;
-        Term fuzziedSourceTerm;
-        
-        public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
-          this.term = term;
-          this.score = score;
-          this.fuzziedSourceTerm=fuzziedSourceTerm;
-        }
-      }
-      
-      private static class ScoreTermQueue extends PriorityQueue {        
-        public ScoreTermQueue(int size){
-          initialize(size);
-        }
-        
-        /* (non-Javadoc)
-         * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
-         */
-        protected boolean lessThan(Object a, Object b) {
-          ScoreTerm termA = (ScoreTerm)a;
-          ScoreTerm termB = (ScoreTerm)b;
-          if (termA.score== termB.score)
-            return termA.term.compareTo(termB.term) > 0;
-          else
-            return termA.score < termB.score;
-        }
-        
-      }
-      
-      //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
-      private static class FuzzyTermQuery extends TermQuery
-      {
-    	  boolean ignoreTF;
-          public FuzzyTermQuery(Term t, boolean ignoreTF)
-          {
-        	  super(t);
-        	  this.ignoreTF=ignoreTF;
-          }
-          public Similarity getSimilarity(Searcher searcher)
-          {            
-              Similarity result = super.getSimilarity(searcher);
-              result = new SimilarityDelegator(result) {
-                  
-                  public float tf(float freq)
-                  {
-                	  if(ignoreTF)
-                	  {
-                          return 1; //ignore tf
-                	  }
-            		  return super.tf(freq);
-                  }
-                  public float idf(int docFreq, int numDocs)
-                  {
-                      //IDF is already factored into individual term boosts
-                      return 1;
-                  }               
-              };
-              return result;
-          }        
-      }
-      
-      
-
-    /* (non-Javadoc)
-     * @see org.apache.lucene.search.Query#toString(java.lang.String)
-     */
-    public String toString(String field)
-    {
-        return null;
-    }
-
-
-	public boolean isIgnoreTF()
-	{
-		return ignoreTF;
-	}
-
-
-	public void setIgnoreTF(boolean ignoreTF)
-	{
-		this.ignoreTF = ignoreTF;
-	}   
-    
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+ * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
+ * of fuzzy scoring factors.
+ * This generally produces good results for queries where users may provide details in a number of 
+ * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+ * a fast query.
+ * 
+ * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
+ * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+ * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
+ * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
+ * term) and this is factored into the variant's boost. If the source query term does not exist in the
+ * index the average IDF of the variants is used.
+ */
+public class FuzzyLikeThisQuery extends Query
+{
+    static Similarity sim=new DefaultSimilarity();
+    Query rewrittenQuery=null;
+    ArrayList fieldVals=new ArrayList();
+    Analyzer analyzer;
+    
+    ScoreTermQueue q;
+    int MAX_VARIANTS_PER_TERM=50;
+    boolean ignoreTF=false;
+
+    
+    /**
+     * 
+     * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
+     * @param analyzer
+     */
+    public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+    {
+        q=new ScoreTermQueue(maxNumTerms);
+        this.analyzer=analyzer;
+    }
+
+    class FieldVals
+    {
+    	String queryString;
+    	String fieldName;
+    	float minSimilarity;
+    	int prefixLength;
+		public FieldVals(String name, float similarity, int length, String queryString)
+		{
+			fieldName = name;
+			minSimilarity = similarity;
+			prefixLength = length;
+			this.queryString = queryString;
+		}
+    	
+    }
+    
+    /**
+     * Adds user input for "fuzzification" 
+     * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
+     * @param fieldName
+     * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
+     * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
+     */
+    public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
+    {
+    	fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
+    }
+    
+    
+    private void addTerms(IndexReader reader,FieldVals f) throws IOException
+    {
+        if(f.queryString==null) return;
+        TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
+        final Token reusableToken = new Token();
+        int corpusNumDocs=reader.numDocs();
+        Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
+        HashSet processedTerms=new HashSet();
+        for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
+        {
+                String term = nextToken.term();
+        	if(!processedTerms.contains(term))
+        	{
+        		processedTerms.add(term);
+                ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+                float minScore=0;
+                Term startTerm=internSavingTemplateTerm.createTerm(term);
+                FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
+                TermEnum origEnum = reader.terms(startTerm);
+                int df=0;
+                if(startTerm.equals(origEnum.term()))
+                {
+                    df=origEnum.docFreq(); //store the df so all variants use same idf
+                }
+                int numVariants=0;
+                int totalVariantDocFreqs=0;
+                do
+                {
+                    Term possibleMatch=fe.term();
+                    if(possibleMatch!=null)
+                    {
+    	                numVariants++;
+    	                totalVariantDocFreqs+=fe.docFreq();
+    	                float score=fe.difference();
+    	                if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
+    	                    ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);                    
+    	                    variantsQ.insert(st);
+    	                    minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
+    	                }
+                    }
+                }
+                while(fe.next());
+                if(numVariants>0)
+                {
+	                int avgDf=totalVariantDocFreqs/numVariants;
+	                if(df==0)//no direct match we can use as df for all variants 
+	                {
+	                    df=avgDf; //use avg df of all variants
+	                }
+	                
+	                // take the top variants (scored by edit distance) and reset the score
+	                // to include an IDF factor then add to the global queue for ranking 
+	                // overall top query terms
+	                int size = variantsQ.size();
+	                for(int i = 0; i < size; i++)
+	                {
+	                  ScoreTerm st = (ScoreTerm) variantsQ.pop();
+	                  st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
+	                  q.insert(st);
+	                }                            
+                }
+        	}
+        }     
+    }
+            
+    public Query rewrite(IndexReader reader) throws IOException
+    {
+        if(rewrittenQuery!=null)
+        {
+            return rewrittenQuery;
+        }
+        //load up the list of possible terms
+        for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
+		{
+			FieldVals f = (FieldVals) iter.next();
+			addTerms(reader,f);			
+		}
+        //clear the list of fields
+        fieldVals.clear();
+        
+        BooleanQuery bq=new BooleanQuery();
+        
+        
+        //create BooleanQueries to hold the variants for each token/field pair and ensure it
+        // has no coord factor
+        //Step 1: sort the termqueries by term/field
+        HashMap variantQueries=new HashMap();
+        int size = q.size();
+        for(int i = 0; i < size; i++)
+        {
+          ScoreTerm st = (ScoreTerm) q.pop();
+          ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
+          if(l==null)
+          {
+              l=new ArrayList();
+              variantQueries.put(st.fuzziedSourceTerm,l);
+          }
+          l.add(st);
+        }
+        //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
+        for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
+        {
+            ArrayList variants = (ArrayList) iter.next();
+            if(variants.size()==1)
+            {
+                //optimize where only one selected variant
+                ScoreTerm st=(ScoreTerm) variants.get(0);
+                TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
+                tq.setBoost(st.score); // set the boost to a mix of IDF and score
+                bq.add(tq, BooleanClause.Occur.SHOULD); 
+            }
+            else
+            {
+                BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
+                for (Iterator iterator2 = variants.iterator(); iterator2
+                        .hasNext();)
+                {
+                    ScoreTerm st = (ScoreTerm) iterator2.next();
+                    TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);      // found a match
+                    tq.setBoost(st.score); // set the boost using the ScoreTerm's score
+                    termVariants.add(tq, BooleanClause.Occur.SHOULD);          // add to query                    
+                }
+                bq.add(termVariants, BooleanClause.Occur.SHOULD);          // add to query
+            }
+        }
+        //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+        // booleans with a minimum-should-match of NumFields-1?
+        bq.setBoost(getBoost());
+        this.rewrittenQuery=bq;
+        return bq;
+    }
+    
+    //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+    // term variants) then is reset with IDF for use in ranking against all other
+    // terms/fields
+    private static class ScoreTerm{
+        public Term term;
+        public float score;
+        Term fuzziedSourceTerm;
+        
+        public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
+          this.term = term;
+          this.score = score;
+          this.fuzziedSourceTerm=fuzziedSourceTerm;
+        }
+      }
+      
+      private static class ScoreTermQueue extends PriorityQueue {        
+        public ScoreTermQueue(int size){
+          initialize(size);
+        }
+        
+        /* (non-Javadoc)
+         * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
+         */
+        protected boolean lessThan(Object a, Object b) {
+          ScoreTerm termA = (ScoreTerm)a;
+          ScoreTerm termB = (ScoreTerm)b;
+          if (termA.score== termB.score)
+            return termA.term.compareTo(termB.term) > 0;
+          else
+            return termA.score < termB.score;
+        }
+        
+      }
+      
+      //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
+      private static class FuzzyTermQuery extends TermQuery
+      {
+    	  boolean ignoreTF;
+          public FuzzyTermQuery(Term t, boolean ignoreTF)
+          {
+        	  super(t);
+        	  this.ignoreTF=ignoreTF;
+          }
+          public Similarity getSimilarity(Searcher searcher)
+          {            
+              Similarity result = super.getSimilarity(searcher);
+              result = new SimilarityDelegator(result) {
+                  
+                  public float tf(float freq)
+                  {
+                	  if(ignoreTF)
+                	  {
+                          return 1; //ignore tf
+                	  }
+            		  return super.tf(freq);
+                  }
+                  public float idf(int docFreq, int numDocs)
+                  {
+                      //IDF is already factored into individual term boosts
+                      return 1;
+                  }               
+              };
+              return result;
+          }        
+      }
+      
+      
+
+    /* (non-Javadoc)
+     * @see org.apache.lucene.search.Query#toString(java.lang.String)
+     */
+    public String toString(String field)
+    {
+        return null;
+    }
+
+
+	public boolean isIgnoreTF()
+	{
+		return ignoreTF;
+	}
+
+
+	public void setIgnoreTF(boolean ignoreTF)
+	{
+		this.ignoreTF = ignoreTF;
+	}   
+    
+}

Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java Mon Nov 10 18:35:46 2008
@@ -1,130 +1,130 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.BitSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.util.OpenBitSet;
-
-/**
- * Constructs a filter for docs matching any of the terms added to this class. 
- * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in 
- * a sequence. An example might be a collection of primary keys from a database query result or perhaps 
- * a choice of "category" labels picked by the end user. As a filter, this is much faster than the 
- * equivalent query (a BooleanQuery with many "should" TermQueries)
- *
- */
-public class TermsFilter extends Filter
-{
-	Set terms=new TreeSet();
-	
-	/**
-	 * Adds a term to the list of acceptable terms   
-	 * @param term
-	 */
-	public void addTerm(Term term)
-	{
-		terms.add(term);
-	}
-	
-	
-
-	/* (non-Javadoc)
-	 * @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
-	 */
-	public BitSet bits(IndexReader reader) throws IOException
-	{
-		BitSet result=new BitSet(reader.maxDoc());
-        TermDocs td = reader.termDocs();
-        try
-        {
-            for (Iterator iter = terms.iterator(); iter.hasNext();)
-            {
-                Term term = (Term) iter.next();
-                td.seek(term);
-                while (td.next())
-                {
-                    result.set(td.doc());
-                }
-            }
-        }
-        finally
-        {
-            td.close();
-        }
-        return result;
-	}
-
-
-
-/* (non-Javadoc)
-   * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
-	 */
-  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
-	{
-    OpenBitSet result=new OpenBitSet(reader.maxDoc());
-        TermDocs td = reader.termDocs();
-        try
-        {
-            for (Iterator iter = terms.iterator(); iter.hasNext();)
-            {
-                Term term = (Term) iter.next();
-                td.seek(term);
-                while (td.next())
-                {
-                    result.set(td.doc());
-                }
-            }
-        }
-        finally
-        {
-            td.close();
-        }
-        return result;
-	}
-	
-	public boolean equals(Object obj)
-	{
-		if(this == obj)
-			return true;
-		if((obj == null) || (obj.getClass() != this.getClass()))
-				return false;
-		TermsFilter test = (TermsFilter)obj;
-		return (terms == test.terms ||
-					 (terms != null && terms.equals(test.terms)));
-	}
-
-	public int hashCode()
-	{
-		int hash=9;
-		for (Iterator iter = terms.iterator(); iter.hasNext();)
-		{
-			Term term = (Term) iter.next();
-			hash = 31 * hash + term.hashCode();			
-		}
-		return hash;
-	}
-	
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.util.OpenBitSet;
+
+/**
+ * Constructs a filter for docs matching any of the terms added to this class. 
+ * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in 
+ * a sequence. An example might be a collection of primary keys from a database query result or perhaps 
+ * a choice of "category" labels picked by the end user. As a filter, this is much faster than the 
+ * equivalent query (a BooleanQuery with many "should" TermQueries)
+ *
+ */
+public class TermsFilter extends Filter
+{
+	Set terms=new TreeSet();
+	
+	/**
+	 * Adds a term to the list of acceptable terms   
+	 * @param term
+	 */
+	public void addTerm(Term term)
+	{
+		terms.add(term);
+	}
+	
+	
+
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
+	 */
+	public BitSet bits(IndexReader reader) throws IOException
+	{
+		BitSet result=new BitSet(reader.maxDoc());
+        TermDocs td = reader.termDocs();
+        try
+        {
+            for (Iterator iter = terms.iterator(); iter.hasNext();)
+            {
+                Term term = (Term) iter.next();
+                td.seek(term);
+                while (td.next())
+                {
+                    result.set(td.doc());
+                }
+            }
+        }
+        finally
+        {
+            td.close();
+        }
+        return result;
+	}
+
+
+
+/* (non-Javadoc)
+   * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
+	 */
+  public DocIdSet getDocIdSet(IndexReader reader) throws IOException
+	{
+    OpenBitSet result=new OpenBitSet(reader.maxDoc());
+        TermDocs td = reader.termDocs();
+        try
+        {
+            for (Iterator iter = terms.iterator(); iter.hasNext();)
+            {
+                Term term = (Term) iter.next();
+                td.seek(term);
+                while (td.next())
+                {
+                    result.set(td.doc());
+                }
+            }
+        }
+        finally
+        {
+            td.close();
+        }
+        return result;
+	}
+	
+	public boolean equals(Object obj)
+	{
+		if(this == obj)
+			return true;
+		if((obj == null) || (obj.getClass() != this.getClass()))
+				return false;
+		TermsFilter test = (TermsFilter)obj;
+		return (terms == test.terms ||
+					 (terms != null && terms.equals(test.terms)));
+	}
+
+	public int hashCode()
+	{
+		int hash=9;
+		for (Iterator iter = terms.iterator(); iter.hasNext();)
+		{
+			Term term = (Term) iter.next();
+			hash = 31 * hash + term.hashCode();			
+		}
+		return hash;
+	}
+	
+}

Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native