You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2011/05/18 05:58:49 UTC
svn commit: r1104680 - in /lucene/dev/branches/branch_3x/lucene: ./
contrib/grouping/src/java/org/apache/lucene/search/grouping/
contrib/grouping/src/test/org/apache/lucene/search/grouping/
src/java/org/apache/lucene/search/ src/test/org/apache/lucene/...
Author: shaie
Date: Wed May 18 03:58:49 2011
New Revision: 1104680
URL: http://svn.apache.org/viewvc?rev=1104680&view=rev
Log:
LUCENE-3102: add factory method to CachingCollector
Modified:
lucene/dev/branches/branch_3x/lucene/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html
lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Wed May 18 03:58:49 2011
@@ -49,6 +49,10 @@ New features
* LUCENE-3071: Adding ReversePathHierarchyTokenizer, added skip parameter to
PathHierarchyTokenizer (Olivier Favre via ryan)
+* LUCENE-1421, LUCENE-3102: added CachingCollector which allows you to cache
+ document IDs and scores encountered during the search, and "replay" them to
+ another Collector. (Mike McCandless, Shai Erera)
+
API Changes
* LUCENE-3061: IndexWriter's getNextMerge() and merge(OneMerge) are now public
Modified: lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html Wed May 18 03:58:49 2011
@@ -73,7 +73,7 @@ field fall into a single group.</p>
boolean cacheScores = true;
double maxCacheRAMMB = 4.0;
- CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
+ CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
Modified: lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Wed May 18 03:58:49 2011
@@ -452,10 +452,10 @@ public class TestGrouping extends Lucene
}
if (doAllGroups) {
- cCache = new CachingCollector(c1, true, maxCacheMB);
+ cCache = CachingCollector.create(c1, true, maxCacheMB);
c = MultiCollector.wrap(cCache, groupCountCollector);
} else {
- c = cCache = new CachingCollector(c1, true, maxCacheMB);
+ c = cCache = CachingCollector.create(c1, true, maxCacheMB);
}
} else if (doAllGroups) {
c = MultiCollector.wrap(c1, groupCountCollector);
Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java Wed May 18 03:58:49 2011
@@ -47,7 +47,7 @@ import org.apache.lucene.util.RamUsageEs
*
* @lucene.experimental
*/
-public class CachingCollector extends Collector {
+public abstract class CachingCollector extends Collector {
// Max out at 512K arrays
private static final int MAX_ARRAY_SIZE = 512 * 1024;
@@ -66,7 +66,7 @@ public class CachingCollector extends Co
}
}
- private static class CachedScorer extends Scorer {
+ private static final class CachedScorer extends Scorer {
// NOTE: these members are package-private b/c that way accessing them from
// the outer class does not incur access check by the JVM. The same
@@ -78,135 +78,260 @@ public class CachingCollector extends Co
private CachedScorer() { super((Weight) null); }
@Override
- public float score() { return score; }
+ public final float score() { return score; }
@Override
- public int advance(int target) { throw new UnsupportedOperationException(); }
+ public final int advance(int target) { throw new UnsupportedOperationException(); }
@Override
- public int docID() { return doc; }
+ public final int docID() { return doc; }
@Override
- public float freq() { throw new UnsupportedOperationException(); }
+ public final float freq() { throw new UnsupportedOperationException(); }
@Override
- public int nextDoc() { throw new UnsupportedOperationException(); }
+ public final int nextDoc() { throw new UnsupportedOperationException(); }
}
- // TODO: would be nice if a collector defined a
- // needsScores() method so we can specialize / do checks
- // up front:
- private final Collector other;
- private final int maxDocsToCache;
-
- private final boolean cacheScores;
- private final CachedScorer cachedScorer;
- private final List<int[]> cachedDocs;
- private final List<float[]> cachedScores;
- private final List<SegStart> cachedSegs = new ArrayList<SegStart>();
-
- private Scorer scorer;
- private int[] curDocs;
- private float[] curScores;
- private int upto;
- private IndexReader lastReader;
- private int base;
- private int lastDocBase;
+ // A CachingCollector which caches scores
+ private static final class ScoreCachingCollector extends CachingCollector {
+
+ private final CachedScorer cachedScorer;
+ private final List<float[]> cachedScores;
+
+ private Scorer scorer;
+ private float[] curScores;
+
+ ScoreCachingCollector(Collector other, double maxRAMMB) {
+ super(other, maxRAMMB, true);
- public CachingCollector(Collector other, boolean cacheScores, double maxRAMMB) {
- this.other = other;
- this.cacheScores = cacheScores;
- if (cacheScores) {
cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[128];
cachedScores.add(curScores);
- } else {
- cachedScorer = null;
- cachedScores = null;
}
- cachedDocs = new ArrayList<int[]>();
- curDocs = new int[INITIAL_ARRAY_SIZE];
- cachedDocs.add(curDocs);
+
+ @Override
+ public void collect(int doc) throws IOException {
- int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
- if (cacheScores) {
- bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
+ if (curDocs == null) {
+ // Cache was too large
+ cachedScorer.score = scorer.score();
+ cachedScorer.doc = doc;
+ other.collect(doc);
+ return;
+ }
+
+ // Allocate a bigger array or abort caching
+ if (upto == curDocs.length) {
+ base += upto;
+
+ // Compute next array length - don't allocate too big arrays
+ int nextLength = 8*curDocs.length;
+ if (nextLength > MAX_ARRAY_SIZE) {
+ nextLength = MAX_ARRAY_SIZE;
+ }
+
+ if (base + nextLength > maxDocsToCache) {
+ // try to allocate a smaller array
+ nextLength = maxDocsToCache - base;
+ if (nextLength <= 0) {
+ // Too many docs to collect -- clear cache
+ curDocs = null;
+ curScores = null;
+ cachedSegs.clear();
+ cachedDocs.clear();
+ cachedScores.clear();
+ cachedScorer.score = scorer.score();
+ cachedScorer.doc = doc;
+ other.collect(doc);
+ return;
+ }
+ }
+
+ curDocs = new int[nextLength];
+ cachedDocs.add(curDocs);
+ curScores = new float[nextLength];
+ cachedScores.add(curScores);
+ upto = 0;
+ }
+
+ curDocs[upto] = doc;
+ cachedScorer.score = curScores[upto] = scorer.score();
+ upto++;
+ cachedScorer.doc = doc;
+ other.collect(doc);
+ }
+
+ @Override
+ public void replay(Collector other) throws IOException {
+ replayInit(other);
+
+ int curUpto = 0;
+ int curBase = 0;
+ int chunkUpto = 0;
+ other.setScorer(cachedScorer);
+ curDocs = EMPTY_INT_ARRAY;
+ for (SegStart seg : cachedSegs) {
+ other.setNextReader(seg.reader, seg.base);
+ while (curBase + curUpto < seg.end) {
+ if (curUpto == curDocs.length) {
+ curBase += curDocs.length;
+ curDocs = cachedDocs.get(chunkUpto);
+ curScores = cachedScores.get(chunkUpto);
+ chunkUpto++;
+ curUpto = 0;
+ }
+ cachedScorer.score = curScores[curUpto];
+ other.collect(curDocs[curUpto++]);
+ }
+ }
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ this.scorer = scorer;
+ other.setScorer(cachedScorer);
+ }
+
+ @Override
+ public String toString() {
+ if (isCached()) {
+ return "CachingCollector (" + (base+upto) + " docs & scores cached)";
+ } else {
+ return "CachingCollector (cache was cleared)";
+ }
}
- maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
- }
-
- @Override
- public void setScorer(Scorer scorer) throws IOException {
- this.scorer = scorer;
- other.setScorer(cachedScorer);
- }
- @Override
- public boolean acceptsDocsOutOfOrder() {
- return other.acceptsDocsOutOfOrder();
}
- @Override
- public void collect(int doc) throws IOException {
+ // A CachingCollector which does not cache scores
+ private static final class NoScoreCachingCollector extends CachingCollector {
+
+ NoScoreCachingCollector(Collector other, double maxRAMMB) {
+ super(other, maxRAMMB, false);
+ }
+
+ @Override
+ public void collect(int doc) throws IOException {
- if (curDocs == null) {
- // Cache was too large
- if (cacheScores) {
- cachedScorer.score = scorer.score();
+ if (curDocs == null) {
+ // Cache was too large
+ other.collect(doc);
+ return;
}
- cachedScorer.doc = doc;
+
+ // Allocate a bigger array or abort caching
+ if (upto == curDocs.length) {
+ base += upto;
+
+ // Compute next array length - don't allocate too big arrays
+ int nextLength = 8*curDocs.length;
+ if (nextLength > MAX_ARRAY_SIZE) {
+ nextLength = MAX_ARRAY_SIZE;
+ }
+
+ if (base + nextLength > maxDocsToCache) {
+ // try to allocate a smaller array
+ nextLength = maxDocsToCache - base;
+ if (nextLength <= 0) {
+ // Too many docs to collect -- clear cache
+ curDocs = null;
+ cachedSegs.clear();
+ cachedDocs.clear();
+ other.collect(doc);
+ return;
+ }
+ }
+
+ curDocs = new int[nextLength];
+ cachedDocs.add(curDocs);
+ upto = 0;
+ }
+
+ curDocs[upto] = doc;
+ upto++;
other.collect(doc);
- return;
}
- // Allocate a bigger array or abort caching
- if (upto == curDocs.length) {
- base += upto;
+ @Override
+ public void replay(Collector other) throws IOException {
+ replayInit(other);
- // Compute next array length - don't allocate too big arrays
- int nextLength = 8*curDocs.length;
- if (nextLength > MAX_ARRAY_SIZE) {
- nextLength = MAX_ARRAY_SIZE;
- }
-
- if (base + nextLength > maxDocsToCache) {
- // try to allocate a smaller array
- nextLength = maxDocsToCache - base;
- if (nextLength <= 0) {
- // Too many docs to collect -- clear cache
- curDocs = null;
- curScores = null;
- cachedSegs.clear();
- cachedDocs.clear();
- cachedScores.clear();
- if (cacheScores) {
- cachedScorer.score = scorer.score();
+ int curUpto = 0;
+ int curbase = 0;
+ int chunkUpto = 0;
+ curDocs = EMPTY_INT_ARRAY;
+ for (SegStart seg : cachedSegs) {
+ other.setNextReader(seg.reader, seg.base);
+ while (curbase + curUpto < seg.end) {
+ if (curUpto == curDocs.length) {
+ curbase += curDocs.length;
+ curDocs = cachedDocs.get(chunkUpto);
+ chunkUpto++;
+ curUpto = 0;
}
- cachedScorer.doc = doc;
- other.collect(doc);
- return;
+ other.collect(curDocs[curUpto++]);
}
}
-
- curDocs = new int[nextLength];
- cachedDocs.add(curDocs);
- if (cacheScores) {
- curScores = new float[nextLength];
- cachedScores.add(curScores);
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {
+ other.setScorer(scorer);
+ }
+
+ @Override
+ public String toString() {
+ if (isCached()) {
+ return "CachingCollector (" + (base+upto) + " docs cached)";
+ } else {
+ return "CachingCollector (cache was cleared)";
}
- upto = 0;
}
-
- curDocs[upto] = doc;
- // TODO: maybe specialize private subclass so we don't
- // null check per collect...
+
+ }
+
+ // TODO: would be nice if a collector defined a
+ // needsScores() method so we can specialize / do checks
+ // up front. This is only relevant for the ScoreCaching
+ // version -- if the wrapped Collector does not need
+ // scores, it can avoid cachedScorer entirely.
+ protected final Collector other;
+
+ protected final int maxDocsToCache;
+ protected final List<SegStart> cachedSegs = new ArrayList<SegStart>();
+ protected final List<int[]> cachedDocs;
+
+ private IndexReader lastReader;
+
+ protected int[] curDocs;
+ protected int upto;
+ protected int base;
+ protected int lastDocBase;
+
+ public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
+ return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
+ }
+
+ // Prevent extension from non-internal classes
+ private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) {
+ this.other = other;
+
+ cachedDocs = new ArrayList<int[]>();
+ curDocs = new int[INITIAL_ARRAY_SIZE];
+ cachedDocs.add(curDocs);
+
+ int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
if (cacheScores) {
- cachedScorer.score = curScores[upto] = scorer.score();
+ bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
}
- upto++;
- cachedScorer.doc = doc;
- other.collect(doc);
+ maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return other.acceptsDocsOutOfOrder();
}
public boolean isCached() {
@@ -223,26 +348,8 @@ public class CachingCollector extends Co
lastReader = reader;
}
- @Override
- public String toString() {
- if (isCached()) {
- return "CachingCollector (" + (base+upto) + " docs " + (cacheScores ? " & scores" : "") + " cached)";
- } else {
- return "CachingCollector (cache was cleared)";
- }
- }
-
- /**
- * Replays the cached doc IDs (and scores) to the given Collector.
- *
- * @throws IllegalStateException
- * if this collector is not cached (i.e., if the RAM limits were too
- * low for the number of documents + scores to cache).
- * @throws IllegalArgumentException
- * if the given Collect's does not support out-of-order collection,
- * while the collector passed to the ctor does.
- */
- public void replay(Collector other) throws IOException {
+ /** Reused by the specialized inner classes. */
+ void replayInit(Collector other) {
if (!isCached()) {
throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
}
@@ -259,29 +366,20 @@ public class CachingCollector extends Co
cachedSegs.add(new SegStart(lastReader, lastDocBase, base+upto));
lastReader = null;
}
-
- int curupto = 0;
- int curbase = 0;
- int chunkUpto = 0;
- other.setScorer(cachedScorer);
- curDocs = EMPTY_INT_ARRAY;
- for (SegStart seg : cachedSegs) {
- other.setNextReader(seg.reader, seg.base);
- while (curbase + curupto < seg.end) {
- if (curupto == curDocs.length) {
- curbase += curDocs.length;
- curDocs = cachedDocs.get(chunkUpto);
- if (cacheScores) {
- curScores = cachedScores.get(chunkUpto);
- }
- chunkUpto++;
- curupto = 0;
- }
- if (cacheScores) {
- cachedScorer.score = curScores[curupto];
- }
- other.collect(curDocs[curupto++]);
- }
- }
}
+
+ /**
+ * Replays the cached doc IDs (and scores) to the given Collector. If this
+ * instance does not cache scores, then {@code other.setScorer} is not
+ * called and scores are not replayed.
+ *
+ * @throws IllegalStateException
+ * if this collector is not cached (i.e., if the RAM limits were too
+ * low for the number of documents + scores to cache).
+ * @throws IllegalArgumentException
+ * if the given Collector does not support out-of-order collection,
+ * while the collector passed to the ctor does.
+ */
+ public abstract void replay(Collector other) throws IOException;
+
}
Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java Wed May 18 03:58:49 2011
@@ -75,39 +75,41 @@ public class TestCachingCollector extend
}
public void testBasic() throws Exception {
- CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 1);
- cc.setScorer(new MockScorer());
-
- // collect 1000 docs
- for (int i = 0; i < 1000; i++) {
- cc.collect(i);
- }
-
- // now replay them
- cc.replay(new Collector() {
- int prevDocID = -1;
-
- @Override
- public void setScorer(Scorer scorer) throws IOException {}
-
- @Override
- public void setNextReader(IndexReader reader, int docBase) throws IOException {}
+ for (boolean cacheScores : new boolean[] { false, true }) {
+ CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1);
+ cc.setScorer(new MockScorer());
- @Override
- public void collect(int doc) throws IOException {
- assertEquals(prevDocID + 1, doc);
- prevDocID = doc;
+ // collect 1000 docs
+ for (int i = 0; i < 1000; i++) {
+ cc.collect(i);
}
- @Override
- public boolean acceptsDocsOutOfOrder() {
- return false;
- }
- });
+ // now replay them
+ cc.replay(new Collector() {
+ int prevDocID = -1;
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {}
+
+ @Override
+ public void setNextReader(IndexReader reader, int docBase) throws IOException {}
+
+ @Override
+ public void collect(int doc) throws IOException {
+ assertEquals(prevDocID + 1, doc);
+ prevDocID = doc;
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+ });
+ }
}
public void testIllegalStateOnReplay() throws Exception {
- CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+ CachingCollector cc = CachingCollector.create(new NoOpCollector(false), true, 50 * ONE_BYTE);
cc.setScorer(new MockScorer());
// collect 130 docs, this should be enough for triggering cache abort.
@@ -130,14 +132,14 @@ public class TestCachingCollector extend
// is valid with the Collector passed to the ctor
// 'src' Collector does not support out-of-order
- CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+ CachingCollector cc = CachingCollector.create(new NoOpCollector(false), true, 50 * ONE_BYTE);
cc.setScorer(new MockScorer());
for (int i = 0; i < 10; i++) cc.collect(i);
cc.replay(new NoOpCollector(true)); // this call should not fail
cc.replay(new NoOpCollector(false)); // this call should not fail
// 'src' Collector supports out-of-order
- cc = new CachingCollector(new NoOpCollector(true), true, 50 * ONE_BYTE);
+ cc = CachingCollector.create(new NoOpCollector(true), true, 50 * ONE_BYTE);
cc.setScorer(new MockScorer());
for (int i = 0; i < 10; i++) cc.collect(i);
cc.replay(new NoOpCollector(true)); // this call should not fail
@@ -156,14 +158,18 @@ public class TestCachingCollector extend
// set RAM limit enough for 150 docs + random(10000)
int numDocs = random.nextInt(10000) + 150;
- CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 8 * ONE_BYTE * numDocs);
- cc.setScorer(new MockScorer());
- for (int i = 0; i < numDocs; i++) cc.collect(i);
- assertTrue(cc.isCached());
-
- // The 151's document should terminate caching
- cc.collect(numDocs);
- assertFalse(cc.isCached());
+ for (boolean cacheScores : new boolean[] { false, true }) {
+ int bytesPerDoc = cacheScores ? 8 : 4;
+ CachingCollector cc = CachingCollector.create(new NoOpCollector(false),
+ cacheScores, bytesPerDoc * ONE_BYTE * numDocs);
+ cc.setScorer(new MockScorer());
+ for (int i = 0; i < numDocs; i++) cc.collect(i);
+ assertTrue(cc.isCached());
+
+ // The (numDocs + 1)th document should terminate caching
+ cc.collect(numDocs);
+ assertFalse(cc.isCached());
+ }
}
}