You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by yo...@apache.org on 2008/11/11 03:35:52 UTC
svn commit: r712922 [3/9] - in /lucene/java/trunk:
contrib/analyzers/src/java/org/apache/lucene/analysis/el/
contrib/analyzers/src/java/org/apache/lucene/analysis/fr/
contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/
contrib/analyzer...
Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java Mon Nov 10 18:35:46 2008
@@ -1,248 +1,248 @@
-package org.apache.lucene.search;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-import java.io.IOException;
-import java.util.BitSet;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.util.OpenBitSet;
-
-public class DuplicateFilter extends Filter
-{
-
- String fieldName;
-
- /**
- * KeepMode determines which document id to consider as the master, all others being
- * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
- */
- int keepMode=KM_USE_FIRST_OCCURRENCE;
- public static final int KM_USE_FIRST_OCCURRENCE=1;
- public static final int KM_USE_LAST_OCCURRENCE=2;
-
- /**
- * "Full" processing mode starts by setting all bits to false and only setting bits
- * for documents that contain the given field and are identified as none-duplicates.
-
- * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
- * given field. This approach avoids the need to read TermDocs for terms that are seen
- * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
- * faster approach , the downside is that bitsets produced will include bits set for
- * documents that do not actually contain the field given.
- *
- */
- int processingMode=PM_FULL_VALIDATION;
- public static final int PM_FULL_VALIDATION=1;
- public static final int PM_FAST_INVALIDATION=2;
-
-
-
- public DuplicateFilter(String fieldName)
- {
- this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
- }
-
-
- public DuplicateFilter(String fieldName, int keepMode, int processingMode)
- {
- this.fieldName = fieldName;
- this.keepMode = keepMode;
- this.processingMode = processingMode;
- }
-
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException
- {
- if(processingMode==PM_FAST_INVALIDATION)
- {
- return fastBits(reader);
- }
- else
- {
- return correctBits(reader);
- }
- }
-
- private OpenBitSet correctBits(IndexReader reader) throws IOException
- {
-
- OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid
- Term startTerm=new Term(fieldName);
- TermEnum te = reader.terms(startTerm);
- if(te!=null)
- {
- Term currTerm=te.term();
- while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
- {
- int lastDoc=-1;
- //set non duplicates
- TermDocs td = reader.termDocs(currTerm);
- if(td.next())
- {
- if(keepMode==KM_USE_FIRST_OCCURRENCE)
- {
- bits.set(td.doc());
- }
- else
- {
- do
- {
- lastDoc=td.doc();
- }while(td.next());
- bits.set(lastDoc);
- }
- }
- if(!te.next())
- {
- break;
- }
- currTerm=te.term();
- }
- }
- return bits;
- }
-
- private OpenBitSet fastBits(IndexReader reader) throws IOException
- {
-
- OpenBitSet bits=new OpenBitSet(reader.maxDoc());
- bits.set(0,reader.maxDoc()); //assume all are valid
- Term startTerm=new Term(fieldName);
- TermEnum te = reader.terms(startTerm);
- if(te!=null)
- {
- Term currTerm=te.term();
-
- while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
- {
- if(te.docFreq()>1)
- {
- int lastDoc=-1;
- //unset potential duplicates
- TermDocs td = reader.termDocs(currTerm);
- td.next();
- if(keepMode==KM_USE_FIRST_OCCURRENCE)
- {
- td.next();
- }
- do
- {
- lastDoc=td.doc();
- bits.clear(lastDoc);
- }while(td.next());
- if(keepMode==KM_USE_LAST_OCCURRENCE)
- {
- //restore the last bit
- bits.set(lastDoc);
- }
- }
- if(!te.next())
- {
- break;
- }
- currTerm=te.term();
- }
- }
- return bits;
- }
-
- /**
- * @param args
- * @throws IOException
- * @throws Exception
- */
- public static void main(String[] args) throws Exception
- {
- IndexReader r=IndexReader.open("/indexes/personCentricAnon");
-// IndexReader r=IndexReader.open("/indexes/enron");
- long start=System.currentTimeMillis();
-// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
-// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
- DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
-// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
-// df.setProcessingMode(PM_SLOW_VALIDATION);
- BitSet b = df.bits(r);
- long end=System.currentTimeMillis()-start;
- System.out.println(b.cardinality()+" in "+end+" ms ");
-
- }
-
-
- public String getFieldName()
- {
- return fieldName;
- }
-
-
- public void setFieldName(String fieldName)
- {
- this.fieldName = fieldName;
- }
-
-
- public int getKeepMode()
- {
- return keepMode;
- }
-
-
- public void setKeepMode(int keepMode)
- {
- this.keepMode = keepMode;
- }
-
-
- public boolean equals(Object obj)
- {
- if(this == obj)
- return true;
- if((obj == null) || (obj.getClass() != this.getClass()))
- return false;
- DuplicateFilter other = (DuplicateFilter)obj;
- return keepMode == other.keepMode &&
- processingMode == other.processingMode &&
- (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
- }
-
-
-
- public int hashCode()
- {
- int hash = 217;
- hash = 31 * hash + keepMode;
- hash = 31 * hash + processingMode;
- hash = 31 * hash + fieldName.hashCode();
- return hash;
- }
-
-
- public int getProcessingMode()
- {
- return processingMode;
- }
-
-
- public void setProcessingMode(int processingMode)
- {
- this.processingMode = processingMode;
- }
-
-
-
-}
+package org.apache.lucene.search;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.BitSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.OpenBitSet;
+
+public class DuplicateFilter extends Filter
+{
+
+ String fieldName;
+
+ /**
+ * KeepMode determines which document id to consider as the master, all others being
+ * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+ */
+ int keepMode=KM_USE_FIRST_OCCURRENCE;
+ public static final int KM_USE_FIRST_OCCURRENCE=1;
+ public static final int KM_USE_LAST_OCCURRENCE=2;
+
+ /**
+ * "Full" processing mode starts by setting all bits to false and only setting bits
+ * for documents that contain the given field and are identified as none-duplicates.
+
+ * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+ * given field. This approach avoids the need to read TermDocs for terms that are seen
+ * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
+ * faster approach , the downside is that bitsets produced will include bits set for
+ * documents that do not actually contain the field given.
+ *
+ */
+ int processingMode=PM_FULL_VALIDATION;
+ public static final int PM_FULL_VALIDATION=1;
+ public static final int PM_FAST_INVALIDATION=2;
+
+
+
+ public DuplicateFilter(String fieldName)
+ {
+ this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
+ }
+
+
+ public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+ {
+ this.fieldName = fieldName;
+ this.keepMode = keepMode;
+ this.processingMode = processingMode;
+ }
+
+ public DocIdSet getDocIdSet(IndexReader reader) throws IOException
+ {
+ if(processingMode==PM_FAST_INVALIDATION)
+ {
+ return fastBits(reader);
+ }
+ else
+ {
+ return correctBits(reader);
+ }
+ }
+
+ private OpenBitSet correctBits(IndexReader reader) throws IOException
+ {
+
+ OpenBitSet bits=new OpenBitSet(reader.maxDoc()); //assume all are INvalid
+ Term startTerm=new Term(fieldName);
+ TermEnum te = reader.terms(startTerm);
+ if(te!=null)
+ {
+ Term currTerm=te.term();
+ while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+ {
+ int lastDoc=-1;
+ //set non duplicates
+ TermDocs td = reader.termDocs(currTerm);
+ if(td.next())
+ {
+ if(keepMode==KM_USE_FIRST_OCCURRENCE)
+ {
+ bits.set(td.doc());
+ }
+ else
+ {
+ do
+ {
+ lastDoc=td.doc();
+ }while(td.next());
+ bits.set(lastDoc);
+ }
+ }
+ if(!te.next())
+ {
+ break;
+ }
+ currTerm=te.term();
+ }
+ }
+ return bits;
+ }
+
+ private OpenBitSet fastBits(IndexReader reader) throws IOException
+ {
+
+ OpenBitSet bits=new OpenBitSet(reader.maxDoc());
+ bits.set(0,reader.maxDoc()); //assume all are valid
+ Term startTerm=new Term(fieldName);
+ TermEnum te = reader.terms(startTerm);
+ if(te!=null)
+ {
+ Term currTerm=te.term();
+
+ while((currTerm!=null)&&(currTerm.field()==startTerm.field())) //term fieldnames are interned
+ {
+ if(te.docFreq()>1)
+ {
+ int lastDoc=-1;
+ //unset potential duplicates
+ TermDocs td = reader.termDocs(currTerm);
+ td.next();
+ if(keepMode==KM_USE_FIRST_OCCURRENCE)
+ {
+ td.next();
+ }
+ do
+ {
+ lastDoc=td.doc();
+ bits.clear(lastDoc);
+ }while(td.next());
+ if(keepMode==KM_USE_LAST_OCCURRENCE)
+ {
+ //restore the last bit
+ bits.set(lastDoc);
+ }
+ }
+ if(!te.next())
+ {
+ break;
+ }
+ currTerm=te.term();
+ }
+ }
+ return bits;
+ }
+
+ /**
+ * @param args
+ * @throws IOException
+ * @throws Exception
+ */
+ public static void main(String[] args) throws Exception
+ {
+ IndexReader r=IndexReader.open("/indexes/personCentricAnon");
+// IndexReader r=IndexReader.open("/indexes/enron");
+ long start=System.currentTimeMillis();
+// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_FIRST_OCCURRENCE, PM_FAST_INVALIDATION);
+// DuplicateFilter df = new DuplicateFilter("threadId",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+ DuplicateFilter df = new DuplicateFilter("vehicle.vrm",KM_USE_LAST_OCCURRENCE, PM_FAST_INVALIDATION);
+// DuplicateFilter df = new DuplicateFilter("title",USE_LAST_OCCURRENCE);
+// df.setProcessingMode(PM_SLOW_VALIDATION);
+ BitSet b = df.bits(r);
+ long end=System.currentTimeMillis()-start;
+ System.out.println(b.cardinality()+" in "+end+" ms ");
+
+ }
+
+
+ public String getFieldName()
+ {
+ return fieldName;
+ }
+
+
+ public void setFieldName(String fieldName)
+ {
+ this.fieldName = fieldName;
+ }
+
+
+ public int getKeepMode()
+ {
+ return keepMode;
+ }
+
+
+ public void setKeepMode(int keepMode)
+ {
+ this.keepMode = keepMode;
+ }
+
+
+ public boolean equals(Object obj)
+ {
+ if(this == obj)
+ return true;
+ if((obj == null) || (obj.getClass() != this.getClass()))
+ return false;
+ DuplicateFilter other = (DuplicateFilter)obj;
+ return keepMode == other.keepMode &&
+ processingMode == other.processingMode &&
+ (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
+ }
+
+
+
+ public int hashCode()
+ {
+ int hash = 217;
+ hash = 31 * hash + keepMode;
+ hash = 31 * hash + processingMode;
+ hash = 31 * hash + fieldName.hashCode();
+ return hash;
+ }
+
+
+ public int getProcessingMode()
+ {
+ return processingMode;
+ }
+
+
+ public void setProcessingMode(int processingMode)
+ {
+ this.processingMode = processingMode;
+ }
+
+
+
+}
Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java Mon Nov 10 18:35:46 2008
@@ -1,66 +1,66 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.search.BooleanClause.Occur;
-
-/**
- * A Filter that wrapped with an indication of how that filter
- * is used when composed with another filter.
- * (Follows the boolean logic in BooleanClause for composition
- * of queries.)
- */
-
-public class FilterClause implements java.io.Serializable
-{
- Occur occur = null;
- Filter filter = null;
-
- /**
- * Create a new FilterClause
- * @param filter A Filter object containing a BitSet
- * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
- */
-
- public FilterClause( Filter filter,Occur occur)
- {
- this.occur = occur;
- this.filter = filter;
- }
-
- /**
- * Returns this FilterClause's filter
- * @return A Filter object
- */
-
- public Filter getFilter()
- {
- return filter;
- }
-
- /**
- * Returns this FilterClause's occur parameter
- * @return An Occur object
- */
-
- public Occur getOccur()
- {
- return occur;
- }
-
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.BooleanClause.Occur;
+
+/**
+ * A Filter that wrapped with an indication of how that filter
+ * is used when composed with another filter.
+ * (Follows the boolean logic in BooleanClause for composition
+ * of queries.)
+ */
+
+public class FilterClause implements java.io.Serializable
+{
+ Occur occur = null;
+ Filter filter = null;
+
+ /**
+ * Create a new FilterClause
+ * @param filter A Filter object containing a BitSet
+ * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
+ */
+
+ public FilterClause( Filter filter,Occur occur)
+ {
+ this.occur = occur;
+ this.filter = filter;
+ }
+
+ /**
+ * Returns this FilterClause's filter
+ * @return A Filter object
+ */
+
+ public Filter getFilter()
+ {
+ return filter;
+ }
+
+ /**
+ * Returns this FilterClause's occur parameter
+ * @return An Occur object
+ */
+
+ public Occur getOccur()
+ {
+ return occur;
+ }
+
+}
Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java Mon Nov 10 18:35:46 2008
@@ -1,322 +1,322 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.util.PriorityQueue;
-
-/**
- * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
- * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
- * of fuzzy scoring factors.
- * This generally produces good results for queries where users may provide details in a number of
- * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
- * a fast query.
- *
- * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
- * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
- * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
- * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
- * term) and this is factored into the variant's boost. If the source query term does not exist in the
- * index the average IDF of the variants is used.
- */
-public class FuzzyLikeThisQuery extends Query
-{
- static Similarity sim=new DefaultSimilarity();
- Query rewrittenQuery=null;
- ArrayList fieldVals=new ArrayList();
- Analyzer analyzer;
-
- ScoreTermQueue q;
- int MAX_VARIANTS_PER_TERM=50;
- boolean ignoreTF=false;
-
-
- /**
- *
- * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
- * @param analyzer
- */
- public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
- {
- q=new ScoreTermQueue(maxNumTerms);
- this.analyzer=analyzer;
- }
-
- class FieldVals
- {
- String queryString;
- String fieldName;
- float minSimilarity;
- int prefixLength;
- public FieldVals(String name, float similarity, int length, String queryString)
- {
- fieldName = name;
- minSimilarity = similarity;
- prefixLength = length;
- this.queryString = queryString;
- }
-
- }
-
- /**
- * Adds user input for "fuzzification"
- * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
- * @param fieldName
- * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
- * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
- */
- public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
- {
- fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
- }
-
-
- private void addTerms(IndexReader reader,FieldVals f) throws IOException
- {
- if(f.queryString==null) return;
- TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
- final Token reusableToken = new Token();
- int corpusNumDocs=reader.numDocs();
- Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
- HashSet processedTerms=new HashSet();
- for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
- {
- String term = nextToken.term();
- if(!processedTerms.contains(term))
- {
- processedTerms.add(term);
- ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
- float minScore=0;
- Term startTerm=internSavingTemplateTerm.createTerm(term);
- FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
- TermEnum origEnum = reader.terms(startTerm);
- int df=0;
- if(startTerm.equals(origEnum.term()))
- {
- df=origEnum.docFreq(); //store the df so all variants use same idf
- }
- int numVariants=0;
- int totalVariantDocFreqs=0;
- do
- {
- Term possibleMatch=fe.term();
- if(possibleMatch!=null)
- {
- numVariants++;
- totalVariantDocFreqs+=fe.docFreq();
- float score=fe.difference();
- if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
- ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
- variantsQ.insert(st);
- minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
- }
- }
- }
- while(fe.next());
- if(numVariants>0)
- {
- int avgDf=totalVariantDocFreqs/numVariants;
- if(df==0)//no direct match we can use as df for all variants
- {
- df=avgDf; //use avg df of all variants
- }
-
- // take the top variants (scored by edit distance) and reset the score
- // to include an IDF factor then add to the global queue for ranking
- // overall top query terms
- int size = variantsQ.size();
- for(int i = 0; i < size; i++)
- {
- ScoreTerm st = (ScoreTerm) variantsQ.pop();
- st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
- q.insert(st);
- }
- }
- }
- }
- }
-
- public Query rewrite(IndexReader reader) throws IOException
- {
- if(rewrittenQuery!=null)
- {
- return rewrittenQuery;
- }
- //load up the list of possible terms
- for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
- {
- FieldVals f = (FieldVals) iter.next();
- addTerms(reader,f);
- }
- //clear the list of fields
- fieldVals.clear();
-
- BooleanQuery bq=new BooleanQuery();
-
-
- //create BooleanQueries to hold the variants for each token/field pair and ensure it
- // has no coord factor
- //Step 1: sort the termqueries by term/field
- HashMap variantQueries=new HashMap();
- int size = q.size();
- for(int i = 0; i < size; i++)
- {
- ScoreTerm st = (ScoreTerm) q.pop();
- ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
- if(l==null)
- {
- l=new ArrayList();
- variantQueries.put(st.fuzziedSourceTerm,l);
- }
- l.add(st);
- }
- //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
- for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
- {
- ArrayList variants = (ArrayList) iter.next();
- if(variants.size()==1)
- {
- //optimize where only one selected variant
- ScoreTerm st=(ScoreTerm) variants.get(0);
- TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
- tq.setBoost(st.score); // set the boost to a mix of IDF and score
- bq.add(tq, BooleanClause.Occur.SHOULD);
- }
- else
- {
- BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
- for (Iterator iterator2 = variants.iterator(); iterator2
- .hasNext();)
- {
- ScoreTerm st = (ScoreTerm) iterator2.next();
- TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
- tq.setBoost(st.score); // set the boost using the ScoreTerm's score
- termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
- }
- bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
- }
- }
- //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
- // booleans with a minimum-should-match of NumFields-1?
- bq.setBoost(getBoost());
- this.rewrittenQuery=bq;
- return bq;
- }
-
- //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
- // term variants) then is reset with IDF for use in ranking against all other
- // terms/fields
- private static class ScoreTerm{
- public Term term;
- public float score;
- Term fuzziedSourceTerm;
-
- public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
- this.term = term;
- this.score = score;
- this.fuzziedSourceTerm=fuzziedSourceTerm;
- }
- }
-
- private static class ScoreTermQueue extends PriorityQueue {
- public ScoreTermQueue(int size){
- initialize(size);
- }
-
- /* (non-Javadoc)
- * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
- */
- protected boolean lessThan(Object a, Object b) {
- ScoreTerm termA = (ScoreTerm)a;
- ScoreTerm termB = (ScoreTerm)b;
- if (termA.score== termB.score)
- return termA.term.compareTo(termB.term) > 0;
- else
- return termA.score < termB.score;
- }
-
- }
-
- //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
- private static class FuzzyTermQuery extends TermQuery
- {
- boolean ignoreTF;
- public FuzzyTermQuery(Term t, boolean ignoreTF)
- {
- super(t);
- this.ignoreTF=ignoreTF;
- }
- public Similarity getSimilarity(Searcher searcher)
- {
- Similarity result = super.getSimilarity(searcher);
- result = new SimilarityDelegator(result) {
-
- public float tf(float freq)
- {
- if(ignoreTF)
- {
- return 1; //ignore tf
- }
- return super.tf(freq);
- }
- public float idf(int docFreq, int numDocs)
- {
- //IDF is already factored into individual term boosts
- return 1;
- }
- };
- return result;
- }
- }
-
-
-
- /* (non-Javadoc)
- * @see org.apache.lucene.search.Query#toString(java.lang.String)
- */
- public String toString(String field)
- {
- return null;
- }
-
-
- public boolean isIgnoreTF()
- {
- return ignoreTF;
- }
-
-
- public void setIgnoreTF(boolean ignoreTF)
- {
- this.ignoreTF = ignoreTF;
- }
-
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+ * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
+ * of fuzzy scoring factors.
+ * This generally produces good results for queries where users may provide details in a number of
+ * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+ * a fast query.
+ *
+ * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
+ * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+ * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
+ * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
+ * term) and this is factored into the variant's boost. If the source query term does not exist in the
+ * index the average IDF of the variants is used.
+ */
+public class FuzzyLikeThisQuery extends Query
+{
+ static Similarity sim=new DefaultSimilarity();
+ Query rewrittenQuery=null;
+ ArrayList fieldVals=new ArrayList();
+ Analyzer analyzer;
+
+ ScoreTermQueue q;
+ int MAX_VARIANTS_PER_TERM=50;
+ boolean ignoreTF=false;
+
+
+ /**
+ *
+ * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
+ * @param analyzer
+ */
+ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+ {
+ q=new ScoreTermQueue(maxNumTerms);
+ this.analyzer=analyzer;
+ }
+
+ class FieldVals
+ {
+ String queryString;
+ String fieldName;
+ float minSimilarity;
+ int prefixLength;
+ public FieldVals(String name, float similarity, int length, String queryString)
+ {
+ fieldName = name;
+ minSimilarity = similarity;
+ prefixLength = length;
+ this.queryString = queryString;
+ }
+
+ }
+
+ /**
+  * Registers user input for "fuzzification". The text is only stored here; it is
+  * analyzed and expanded into fuzzy variants when the query is rewritten.
+  *
+  * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be generated
+  * @param fieldName The field the analyzed terms (and their variants) belong to
+  * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
+  * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
+  */
+ public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
+ {
+     FieldVals pending = new FieldVals(fieldName, minSimilarity, prefixLength, queryString);
+     fieldVals.add(pending);
+ }
+
+
+ /**
+  * Tokenizes one registered query string and, for every distinct token, pushes its
+  * best fuzzy variants onto the global queue <code>q</code>.
+  * <p>
+  * For each token the variants reported by a FuzzyTermEnum are ranked in a bounded
+  * queue of MAX_VARIANTS_PER_TERM entries. The surviving variants are then re-scored
+  * as <code>score * score * idf(df, numDocs)</code>, where <code>df</code> is the doc
+  * frequency of the source term itself or, if the source term is not in the index,
+  * the average doc frequency of all its variants.
+  * <p>
+  * Both term enumerations opened per token are closed before moving on, also when an
+  * exception is thrown.
+  *
+  * @param reader the index the variants are looked up in
+  * @param f the field/query-string pair to expand; ignored when its query string is null
+  * @throws IOException if analysis or reading the term dictionary fails
+  */
+ private void addTerms(IndexReader reader,FieldVals f) throws IOException
+ {
+     if(f.queryString==null) return;
+     TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
+     final Token reusableToken = new Token();
+     int corpusNumDocs=reader.numDocs();
+     Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
+     HashSet processedTerms=new HashSet();
+     for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
+     {
+         String term = nextToken.term();
+         if(!processedTerms.add(term))
+         {
+             continue; // this token text has already been expanded
+         }
+         Term startTerm=internSavingTemplateTerm.createTerm(term);
+
+         // doc freq of the exact source term; stays 0 when the term is not indexed
+         int df=0;
+         TermEnum origEnum = reader.terms(startTerm);
+         try
+         {
+             if(startTerm.equals(origEnum.term()))
+             {
+                 df=origEnum.docFreq(); //store the df so all variants use same idf
+             }
+         }
+         finally
+         {
+             origEnum.close();
+         }
+
+         ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+         float minScore=0;
+         int numVariants=0;
+         long totalVariantDocFreqs=0; // long: the sum of many doc freqs may exceed the int range
+         FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
+         try
+         {
+             do
+             {
+                 Term possibleMatch=fe.term();
+                 if(possibleMatch!=null)
+                 {
+                     numVariants++;
+                     totalVariantDocFreqs+=fe.docFreq();
+                     float score=fe.difference();
+                     if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
+                         ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
+                         variantsQ.insert(st);
+                         minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
+                     }
+                 }
+             }
+             while(fe.next());
+         }
+         finally
+         {
+             fe.close();
+         }
+
+         if(numVariants>0)
+         {
+             if(df==0)//no direct match we can use as df for all variants
+             {
+                 df=(int)(totalVariantDocFreqs/numVariants); //use avg df of all variants
+             }
+
+             // take the top variants (scored by edit distance) and reset the score
+             // to include an IDF factor then add to the global queue for ranking
+             // overall top query terms
+             float idf=sim.idf(df,corpusNumDocs); // identical for every variant of this source term
+             int size = variantsQ.size();
+             for(int i = 0; i < size; i++)
+             {
+                 ScoreTerm st = (ScoreTerm) variantsQ.pop();
+                 st.score=(st.score*st.score)*idf;
+                 q.insert(st);
+             }
+         }
+     }
+ }
+
+ public Query rewrite(IndexReader reader) throws IOException
+ {
+ if(rewrittenQuery!=null)
+ {
+ return rewrittenQuery;
+ }
+ //load up the list of possible terms
+ for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
+ {
+ FieldVals f = (FieldVals) iter.next();
+ addTerms(reader,f);
+ }
+ //clear the list of fields
+ fieldVals.clear();
+
+ BooleanQuery bq=new BooleanQuery();
+
+
+ //create BooleanQueries to hold the variants for each token/field pair and ensure it
+ // has no coord factor
+ //Step 1: sort the termqueries by term/field
+ HashMap variantQueries=new HashMap();
+ int size = q.size();
+ for(int i = 0; i < size; i++)
+ {
+ ScoreTerm st = (ScoreTerm) q.pop();
+ ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
+ if(l==null)
+ {
+ l=new ArrayList();
+ variantQueries.put(st.fuzziedSourceTerm,l);
+ }
+ l.add(st);
+ }
+ //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
+ for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
+ {
+ ArrayList variants = (ArrayList) iter.next();
+ if(variants.size()==1)
+ {
+ //optimize where only one selected variant
+ ScoreTerm st=(ScoreTerm) variants.get(0);
+ TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
+ tq.setBoost(st.score); // set the boost to a mix of IDF and score
+ bq.add(tq, BooleanClause.Occur.SHOULD);
+ }
+ else
+ {
+ BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
+ for (Iterator iterator2 = variants.iterator(); iterator2
+ .hasNext();)
+ {
+ ScoreTerm st = (ScoreTerm) iterator2.next();
+ TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
+ tq.setBoost(st.score); // set the boost using the ScoreTerm's score
+ termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
+ }
+ bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
+ }
+ }
+ //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+ // booleans with a minimum-should-match of NumFields-1?
+ bq.setBoost(getBoost());
+ this.rewrittenQuery=bq;
+ return bq;
+ }
+
+ //A fuzzy variant of a source term together with its ranking score. The score
+ // starts out as the value reported by the fuzzy enumeration (used to rank the
+ // variants of one term) and is later overwritten with an IDF-weighted value so
+ // it can be ranked against the variants of all other terms/fields
+ private static class ScoreTerm{
+     Term fuzziedSourceTerm;
+     public Term term;
+     public float score;
+
+     public ScoreTerm(Term variant, float initialScore, Term sourceTerm){
+         term = variant;
+         score = initialScore;
+         fuzziedSourceTerm = sourceTerm;
+     }
+ }
+
+ // Bounded priority queue of ScoreTerms ordered by score, with ties broken on the term.
+ private static class ScoreTermQueue extends PriorityQueue {
+     public ScoreTermQueue(int size){
+         initialize(size);
+     }
+
+     /* (non-Javadoc)
+      * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
+      */
+     protected boolean lessThan(Object a, Object b) {
+         ScoreTerm first = (ScoreTerm)a;
+         ScoreTerm second = (ScoreTerm)b;
+         if (first.score != second.score)
+         {
+             return first.score < second.score;
+         }
+         // equal scores: fall back to the term ordering
+         return first.term.compareTo(second.term) > 0;
+     }
+ }
+
+ //TermQuery whose Similarity always reports an idf of 1 (the IDF factor has already been
+ // folded into the boost of each term) and, optionally, a tf of 1
+ private static class FuzzyTermQuery extends TermQuery
+ {
+     boolean ignoreTF;
+
+     public FuzzyTermQuery(Term t, boolean ignoreTF)
+     {
+         super(t);
+         this.ignoreTF=ignoreTF;
+     }
+
+     // Wraps the inherited Similarity, overriding only tf() and idf().
+     public Similarity getSimilarity(Searcher searcher)
+     {
+         return new SimilarityDelegator(super.getSimilarity(searcher)) {
+
+             public float tf(float freq)
+             {
+                 //ignore tf when requested, otherwise defer to the wrapped Similarity
+                 return ignoreTF ? 1 : super.tf(freq);
+             }
+
+             public float idf(int docFreq, int numDocs)
+             {
+                 //IDF is already factored into individual term boosts
+                 return 1;
+             }
+         };
+     }
+ }
+
+
+
+ /**
+  * Returns a human-readable description of this query. Never returns null.
+  * <p>
+  * Once the query has been rewritten, the description wraps the string form of the
+  * rewritten query. Before that, it lists every pending input as
+  * <code>fieldName:"queryString"~minSimilarity</code>, separated by single spaces.
+  *
+  * @param field the default field, passed on to the rewritten query's toString
+  * @return a non-null description of the form <code>fuzzyLikeThis(...)</code>
+  * @see org.apache.lucene.search.Query#toString(java.lang.String)
+  */
+ public String toString(String field)
+ {
+     StringBuffer buffer = new StringBuffer("fuzzyLikeThis(");
+     if(rewrittenQuery!=null)
+     {
+         buffer.append(rewrittenQuery.toString(field));
+     }
+     else
+     {
+         // not rewritten yet: describe the raw input still waiting to be expanded
+         for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
+         {
+             FieldVals f = (FieldVals) iter.next();
+             buffer.append(f.fieldName).append(":\"").append(f.queryString).append("\"~").append(f.minSimilarity);
+             if(iter.hasNext())
+             {
+                 buffer.append(' ');
+             }
+         }
+     }
+     buffer.append(')');
+     return buffer.toString();
+ }
+
+
+ /**
+  * @return true if the term queries produced by rewrite() are set to ignore term frequency
+  */
+ public boolean isIgnoreTF()
+ {
+ return ignoreTF;
+ }
+
+
+ /**
+  * Sets the flag that is handed to each term query created by rewrite().
+  * @param ignoreTF true to make those term queries score every term frequency as 1
+  */
+ public void setIgnoreTF(boolean ignoreTF)
+ {
+ this.ignoreTF = ignoreTF;
+ }
+
+}
Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java?rev=712922&r1=712921&r2=712922&view=diff
==============================================================================
--- lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java (original)
+++ lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java Mon Nov 10 18:35:46 2008
@@ -1,130 +1,130 @@
-package org.apache.lucene.search;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.BitSet;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.util.OpenBitSet;
-
-/**
- * Constructs a filter for docs matching any of the terms added to this class.
- * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
- * a sequence. An example might be a collection of primary keys from a database query result or perhaps
- * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
- * equivalent query (a BooleanQuery with many "should" TermQueries)
- *
- */
-public class TermsFilter extends Filter
-{
- Set terms=new TreeSet();
-
- /**
- * Adds a term to the list of acceptable terms
- * @param term
- */
- public void addTerm(Term term)
- {
- terms.add(term);
- }
-
-
-
- /* (non-Javadoc)
- * @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
- */
- public BitSet bits(IndexReader reader) throws IOException
- {
- BitSet result=new BitSet(reader.maxDoc());
- TermDocs td = reader.termDocs();
- try
- {
- for (Iterator iter = terms.iterator(); iter.hasNext();)
- {
- Term term = (Term) iter.next();
- td.seek(term);
- while (td.next())
- {
- result.set(td.doc());
- }
- }
- }
- finally
- {
- td.close();
- }
- return result;
- }
-
-
-
-/* (non-Javadoc)
- * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
- */
- public DocIdSet getDocIdSet(IndexReader reader) throws IOException
- {
- OpenBitSet result=new OpenBitSet(reader.maxDoc());
- TermDocs td = reader.termDocs();
- try
- {
- for (Iterator iter = terms.iterator(); iter.hasNext();)
- {
- Term term = (Term) iter.next();
- td.seek(term);
- while (td.next())
- {
- result.set(td.doc());
- }
- }
- }
- finally
- {
- td.close();
- }
- return result;
- }
-
- public boolean equals(Object obj)
- {
- if(this == obj)
- return true;
- if((obj == null) || (obj.getClass() != this.getClass()))
- return false;
- TermsFilter test = (TermsFilter)obj;
- return (terms == test.terms ||
- (terms != null && terms.equals(test.terms)));
- }
-
- public int hashCode()
- {
- int hash=9;
- for (Iterator iter = terms.iterator(); iter.hasNext();)
- {
- Term term = (Term) iter.next();
- hash = 31 * hash + term.hashCode();
- }
- return hash;
- }
-
-}
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.util.OpenBitSet;
+
+/**
+ * Constructs a filter for docs matching any of the terms added to this class.
+ * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
+ * a sequence. An example might be a collection of primary keys from a database query result or perhaps
+ * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
+ * equivalent query (a BooleanQuery with many "should" TermQueries)
+ *
+ */
+public class TermsFilter extends Filter
+{
+ // the accepted terms, kept in sorted order
+ Set terms=new TreeSet();
+
+ /**
+  * Adds a term to the list of acceptable terms
+  * @param term the term to accept
+  */
+ public void addTerm(Term term)
+ {
+     terms.add(term);
+ }
+
+ /**
+  * Marks every document that contains at least one of the added terms.
+  * @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
+  */
+ public BitSet bits(IndexReader reader) throws IOException
+ {
+     BitSet matches=new BitSet(reader.maxDoc());
+     TermDocs postings = reader.termDocs();
+     try
+     {
+         Iterator termIter = terms.iterator();
+         while (termIter.hasNext())
+         {
+             // reposition the shared TermDocs on the next accepted term
+             postings.seek((Term) termIter.next());
+             while (postings.next())
+             {
+                 matches.set(postings.doc());
+             }
+         }
+     }
+     finally
+     {
+         postings.close();
+     }
+     return matches;
+ }
+
+ /**
+  * Same matching as bits(), but collected into an OpenBitSet.
+  * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
+  */
+ public DocIdSet getDocIdSet(IndexReader reader) throws IOException
+ {
+     OpenBitSet matches=new OpenBitSet(reader.maxDoc());
+     TermDocs postings = reader.termDocs();
+     try
+     {
+         Iterator termIter = terms.iterator();
+         while (termIter.hasNext())
+         {
+             // reposition the shared TermDocs on the next accepted term
+             postings.seek((Term) termIter.next());
+             while (postings.next())
+             {
+                 matches.set(postings.doc());
+             }
+         }
+     }
+     finally
+     {
+         postings.close();
+     }
+     return matches;
+ }
+
+ /**
+  * Two filters are equal when they are of the same class and hold equal term sets.
+  */
+ public boolean equals(Object obj)
+ {
+     if(this == obj)
+     {
+         return true;
+     }
+     if(obj == null || obj.getClass() != this.getClass())
+     {
+         return false;
+     }
+     Set otherTerms = ((TermsFilter) obj).terms;
+     if(terms == otherTerms)
+     {
+         return true;
+     }
+     return terms != null && terms.equals(otherTerms);
+ }
+
+ /**
+  * Combines the hash codes of all terms, in the set's iteration order.
+  */
+ public int hashCode()
+ {
+     int hash=9;
+     Iterator termIter = terms.iterator();
+     while (termIter.hasNext())
+     {
+         hash = 31 * hash + ((Term) termIter.next()).hashCode();
+     }
+     return hash;
+ }
+
+}
Propchange: lucene/java/trunk/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java
------------------------------------------------------------------------------
svn:eol-style = native