You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/01/28 22:05:27 UTC
svn commit: r738622 - in /lucene/java/trunk: CHANGES.txt
src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
Author: mikemccand
Date: Wed Jan 28 21:05:26 2009
New Revision: 738622
URL: http://svn.apache.org/viewvc?rev=738622&view=rev
Log:
LUCENE-1487: add FieldCacheTermsFilter, to filter by multiple terms on single-valued fields
Added:
lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (with props)
lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (with props)
Modified:
lucene/java/trunk/CHANGES.txt
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=738622&r1=738621&r2=738622&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed Jan 28 21:05:26 2009
@@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log
$Id$
======================= Trunk (not yet released) =======================
@@ -136,6 +136,12 @@
11. LUCENE-1528: Add support for Ideographic Space to the queryparser.
(Luis Alves via Michael Busch)
+12. LUCENE-1487: Added FieldCacheTermsFilter, to filter by multiple
+ terms on single-valued fields. The filter loads the FieldCache
+ for the field the first time it's called, and subsequent usage of
+ that field, even with different Terms in the filter, is fast.
+ (Tim Sturge, Shalin Shekhar Mangar via Mike McCandless).
+
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
Added: lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java?rev=738622&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java Wed Jan 28 21:05:26 2009
@@ -0,0 +1,117 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.OpenBitSet;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * A term filter built on top of a cached single field (in FieldCache). It can be used only
+ * with single-valued fields.
+ * <p/>
+ * FieldCacheTermsFilter builds a single cache for the field the first time it is used. Each
+ * subsequent FieldCacheTermsFilter on the same field then re-uses this cache even if the terms
+ * themselves are different.
+ * <p/>
+ * The FieldCacheTermsFilter is faster than building a TermsFilter each time.
+ * A FieldCacheTermsFilter is fast to build when the number of documents is far greater than the
+ * number of unique terms. Internally, it creates an OpenBitSet by term number and scans by document id.
+ * <p/>
+ * As with all FieldCache based functionality, FieldCacheTermsFilter is only valid for fields
+ * which contain zero or one term for each document. Thus it works on dates, prices and other
+ * single value fields but will not work on regular text fields. It is preferable to use a
+ * NOT_ANALYZED field to ensure that there is only a single term.
+ * <p/>
+ * Also, collation is performed at the time the FieldCache is built; to change collation you
+ * need to override the getFieldCache() method to change the underlying cache.
+ */
+public class FieldCacheTermsFilter extends Filter {
+ private String field; // name of the field whose StringIndex is requested in getDocIdSet()
+ private Iterable terms; // accepted terms; held by reference and re-iterated for every DocIdSet built
+
+ public FieldCacheTermsFilter(String field, Iterable terms) { // elements of terms are cast to String later, so they must be Strings
+ this.field = field;
+ this.terms = terms; // not copied: later changes to the caller's collection are seen by this filter
+ }
+
+ public FieldCache getFieldCache() { // extension hook: override to supply a different FieldCache
+ return FieldCache.DEFAULT;
+ }
+
+ public DocIdSet getDocIdSet(IndexReader reader) throws IOException { // builds a new DocIdSet per call from the reader's StringIndex for this field
+ return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
+ }
+
+ protected class FieldCacheTermsFilterDocIdSet extends DocIdSet { // inner (non-static) class: reads the enclosing filter's terms
+ private FieldCache.StringIndex fcsi; // supplies lookup (searched by term) and order (indexed by document id)
+
+ private OpenBitSet openBitSet; // sized to fcsi.lookup.length; a set bit marks an accepted term number
+
+ public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) { // marks the term number of every filter term found in fcsi
+ this.fcsi = fcsi;
+ openBitSet = new OpenBitSet(this.fcsi.lookup.length);
+ for (Iterator it = terms.iterator(); it.hasNext();) {
+ Object term = it.next();
+ int termNumber = this.fcsi.binarySearchLookup((String) term); // throws ClassCastException if term is not a String
+ if (termNumber > 0) { // only strictly positive results are recorded; presumably <= 0 means "not found" or a reserved no-term slot - TODO confirm against StringIndex
+ openBitSet.fastSet(termNumber);
+ }
+ }
+ }
+
+ public DocIdSetIterator iterator() { // each call returns a fresh iterator positioned before the first document
+ return new FieldCacheTermsFilterDocIdSetIterator();
+ }
+
+ protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator { // scans fcsi.order by document id, testing each entry against openBitSet
+ private int doc = -1; // current document id; -1 before the first next(), Integer.MAX_VALUE once exhausted
+
+ public int doc() { // returns the current position without advancing
+ return doc;
+ }
+
+ public boolean next() { // advances to the next document whose order value has its bit set; false when none remain
+ try {
+ do {
+ doc++;
+ } while (!openBitSet.fastGet(fcsi.order[doc])); // no explicit bounds check: an out-of-range index is what ends the loop
+ return true;
+ } catch (ArrayIndexOutOfBoundsException e) { // deliberate end-of-iteration signal (e.g. doc ran past the end of fcsi.order)
+ doc = Integer.MAX_VALUE;
+ return false;
+ }
+ }
+
+ public boolean skipTo(int target) { // positions on the first matching document at or after target; false when none remain
+ try {
+ doc = target; // jumps straight to target, even if target is at or behind the current position
+ while (!openBitSet.fastGet(fcsi.order[doc])) {
+ doc++;
+ }
+ return true;
+ } catch (ArrayIndexOutOfBoundsException e) { // same end-of-iteration signal as in next()
+ doc = Integer.MAX_VALUE;
+ return false;
+ }
+ }
+ }
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java?rev=738622&view=auto
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java (added)
+++ lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java Wed Jan 28 21:05:26 2009
@@ -0,0 +1,75 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.MockRAMDirectory;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A basic unit test for FieldCacheTermsFilter
+ *
+ * @see org.apache.lucene.search.FieldCacheTermsFilter
+ */
+public class TestFieldCacheTermsFilter extends TestCase {
+ public void testMissingTerms() throws Exception { // indexes 100 single-term documents, then filters by one absent term and by one and two present terms
+ String fieldName = "field1";
+ MockRAMDirectory rd = new MockRAMDirectory();
+ IndexWriter w = new IndexWriter(rd, new KeywordAnalyzer(), MaxFieldLength.UNLIMITED);
+ for (int i = 0; i < 100; i++) {
+ Document doc = new Document();
+ int term = i * 10; // terms are the multiples of 10 from 0 to 990, one distinct term per document
+ doc.add(new Field(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
+ w.addDocument(doc);
+ }
+ w.close();
+
+ IndexReader reader = IndexReader.open(rd);
+ IndexSearcher searcher = new IndexSearcher(reader);
+ int numDocs = reader.numDocs(); // used as the hit limit so no match can be truncated
+ ScoreDoc[] results;
+ MatchAllDocsQuery q = new MatchAllDocsQuery(); // matches every document, so the hit count reflects the filter alone
+
+ List terms = new ArrayList();
+ terms.add("5"); // never indexed: only multiples of 10 were added
+ results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+ assertEquals("Must match nothing", 0, results.length);
+
+ terms = new ArrayList();
+ terms.add("10"); // indexed exactly once (i == 1)
+ results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+ assertEquals("Must match 1", 1, results.length);
+
+ terms = new ArrayList();
+ terms.add("10");
+ terms.add("20"); // two distinct indexed terms, each carried by one document
+ results = searcher.search(q, new FieldCacheTermsFilter(fieldName, terms), numDocs).scoreDocs;
+ assertEquals("Must match 2", 2, results.length);
+
+ reader.close();
+ rd.close();
+ }
+}
Propchange: lucene/java/trunk/src/test/org/apache/lucene/search/TestFieldCacheTermsFilter.java
------------------------------------------------------------------------------
svn:eol-style = native