Posted to commits@lucene.apache.org by sh...@apache.org on 2011/05/18 05:58:49 UTC

svn commit: r1104680 - in /lucene/dev/branches/branch_3x/lucene: ./ contrib/grouping/src/java/org/apache/lucene/search/grouping/ contrib/grouping/src/test/org/apache/lucene/search/grouping/ src/java/org/apache/lucene/search/ src/test/org/apache/lucene/...

Author: shaie
Date: Wed May 18 03:58:49 2011
New Revision: 1104680

URL: http://svn.apache.org/viewvc?rev=1104680&view=rev
Log:
LUCENE-3102: add factory method to CachingCollector

Modified:
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html
    lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Wed May 18 03:58:49 2011
@@ -49,6 +49,10 @@ New features
 * LUCENE-3071: Adding ReversePathHierarchyTokenizer, added skip parameter to 
   PathHierarchyTokenizer (Olivier Favre via ryan)
 
+* LUCENE-1421, LUCENE-3102: added CachingCollector which allows you to cache 
+  document IDs and scores encountered during the search, and "replay" them to 
+  another Collector. (Mike McCandless, Shai Erera)
+  
 API Changes
 
 * LUCENE-3061: IndexWriter's getNextMerge() and merge(OneMerge) are now public
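
For context, here is the new factory in use, following the package.html
snippet updated below. This is a minimal sketch: the IndexSearcher "s",
the search term, the wrapped Collector "c1" (e.g. a first-pass grouping
collector), and "someOtherCollector" are assumed to exist elsewhere.

  boolean cacheScores = true;
  double maxCacheRAMMB = 4.0;
  // New in this commit: instances are obtained via the static factory
  // rather than the (now private) constructor.
  CachingCollector cachedCollector =
      CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
  s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);

  // The cached doc IDs (and scores) can later be replayed to another
  // Collector without re-executing the query:
  cachedCollector.replay(someOtherCollector);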

Modified: lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/java/org/apache/lucene/search/grouping/package.html Wed May 18 03:58:49 2011
@@ -73,7 +73,7 @@ field fall into a single group.</p>
 
   boolean cacheScores = true;
   double maxCacheRAMMB = 4.0;
-  CachingCollector cachedCollector = new CachingCollector(c1, cacheScores, maxCacheRAMMB);
+  CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
   s.search(new TermQuery(new Term("content", searchTerm)), cachedCollector);
 
   Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);

Modified: lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Wed May 18 03:58:49 2011
@@ -452,10 +452,10 @@ public class TestGrouping extends Lucene
           }
 
           if (doAllGroups) {
-            cCache = new CachingCollector(c1, true, maxCacheMB);
+            cCache = CachingCollector.create(c1, true, maxCacheMB);
             c = MultiCollector.wrap(cCache, groupCountCollector);
           } else {
-            c = cCache = new CachingCollector(c1, true, maxCacheMB);
+            c = cCache = CachingCollector.create(c1, true, maxCacheMB);
           }
         } else if (doAllGroups) {
           c = MultiCollector.wrap(c1, groupCountCollector);
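
The TestGrouping change above also illustrates the intended composition
pattern: when a second collector (here, a group-count collector) must
observe the same hits, the CachingCollector is combined with it via
MultiCollector during the single search pass. A condensed sketch, with
"allGroupsCollector" standing in for the test's groupCountCollector:

  CachingCollector cCache = CachingCollector.create(c1, true, maxCacheMB);
  // Both collectors see every hit; only cCache records them for replay.
  Collector c = MultiCollector.wrap(cCache, allGroupsCollector);
  s.search(query, c);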

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/search/CachingCollector.java Wed May 18 03:58:49 2011
@@ -47,7 +47,7 @@ import org.apache.lucene.util.RamUsageEs
  *
  * @lucene.experimental
  */
-public class CachingCollector extends Collector {
+public abstract class CachingCollector extends Collector {
   
   // Max out at 512K arrays
   private static final int MAX_ARRAY_SIZE = 512 * 1024;
@@ -66,7 +66,7 @@ public class CachingCollector extends Co
     }
   }
   
-  private static class CachedScorer extends Scorer {
+  private static final class CachedScorer extends Scorer {
 
     // NOTE: these members are package-private b/c that way accessing them from
     // the outer class does not incur access check by the JVM. The same
@@ -78,135 +78,260 @@ public class CachingCollector extends Co
     private CachedScorer() { super((Weight) null); }
 
     @Override
-    public float score() { return score; }
+    public final float score() { return score; }
 
     @Override
-    public int advance(int target) { throw new UnsupportedOperationException(); }
+    public final int advance(int target) { throw new UnsupportedOperationException(); }
 
     @Override
-    public int docID() { return doc; }
+    public final int docID() { return doc; }
 
     @Override
-    public float freq() { throw new UnsupportedOperationException(); }
+    public final float freq() { throw new UnsupportedOperationException(); }
 
     @Override
-    public int nextDoc() { throw new UnsupportedOperationException(); }
+    public final int nextDoc() { throw new UnsupportedOperationException(); }
   }
 
-  // TODO: would be nice if a collector defined a
-  // needsScores() method so we can specialize / do checks
-  // up front:
-  private final Collector other;
-  private final int maxDocsToCache;
-
-  private final boolean cacheScores;
-  private final CachedScorer cachedScorer;
-  private final List<int[]> cachedDocs;
-  private final List<float[]> cachedScores;
-  private final List<SegStart> cachedSegs = new ArrayList<SegStart>();
-
-  private Scorer scorer;
-  private int[] curDocs;
-  private float[] curScores;
-  private int upto;
-  private IndexReader lastReader;
-  private int base;
-  private int lastDocBase;
+  // A CachingCollector which caches scores
+  private static final class ScoreCachingCollector extends CachingCollector {
+
+    private final CachedScorer cachedScorer;
+    private final List<float[]> cachedScores;
+
+    private Scorer scorer;
+    private float[] curScores;
+
+    ScoreCachingCollector(Collector other, double maxRAMMB) {
+      super(other, maxRAMMB, true);
 
-  public CachingCollector(Collector other, boolean cacheScores, double maxRAMMB) {
-    this.other = other;
-    this.cacheScores = cacheScores;
-    if (cacheScores) {
       cachedScorer = new CachedScorer();
       cachedScores = new ArrayList<float[]>();
       curScores = new float[128];
       cachedScores.add(curScores);
-    } else {
-      cachedScorer = null;
-      cachedScores = null;
     }
-    cachedDocs = new ArrayList<int[]>();
-    curDocs = new int[INITIAL_ARRAY_SIZE];
-    cachedDocs.add(curDocs);
+    
+    @Override
+    public void collect(int doc) throws IOException {
 
-    int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
-    if (cacheScores) {
-      bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
+      if (curDocs == null) {
+        // Cache was too large
+        cachedScorer.score = scorer.score();
+        cachedScorer.doc = doc;
+        other.collect(doc);
+        return;
+      }
+
+      // Allocate a bigger array or abort caching
+      if (upto == curDocs.length) {
+        base += upto;
+        
+        // Compute next array length - don't allocate too big arrays
+        int nextLength = 8*curDocs.length;
+        if (nextLength > MAX_ARRAY_SIZE) {
+          nextLength = MAX_ARRAY_SIZE;
+        }
+
+        if (base + nextLength > maxDocsToCache) {
+          // try to allocate a smaller array
+          nextLength = maxDocsToCache - base;
+          if (nextLength <= 0) {
+            // Too many docs to collect -- clear cache
+            curDocs = null;
+            curScores = null;
+            cachedSegs.clear();
+            cachedDocs.clear();
+            cachedScores.clear();
+            cachedScorer.score = scorer.score();
+            cachedScorer.doc = doc;
+            other.collect(doc);
+            return;
+          }
+        }
+        
+        curDocs = new int[nextLength];
+        cachedDocs.add(curDocs);
+        curScores = new float[nextLength];
+        cachedScores.add(curScores);
+        upto = 0;
+      }
+      
+      curDocs[upto] = doc;
+      cachedScorer.score = curScores[upto] = scorer.score();
+      upto++;
+      cachedScorer.doc = doc;
+      other.collect(doc);
+    }
+
+    @Override
+    public void replay(Collector other) throws IOException {
+      replayInit(other);
+      
+      int curUpto = 0;
+      int curBase = 0;
+      int chunkUpto = 0;
+      other.setScorer(cachedScorer);
+      curDocs = EMPTY_INT_ARRAY;
+      for (SegStart seg : cachedSegs) {
+        other.setNextReader(seg.reader, seg.base);
+        while (curBase + curUpto < seg.end) {
+          if (curUpto == curDocs.length) {
+            curBase += curDocs.length;
+            curDocs = cachedDocs.get(chunkUpto);
+            curScores = cachedScores.get(chunkUpto);
+            chunkUpto++;
+            curUpto = 0;
+          }
+          cachedScorer.score = curScores[curUpto];
+          other.collect(curDocs[curUpto++]);
+        }
+      }
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      this.scorer = scorer;
+      other.setScorer(cachedScorer);
+    }
+
+    @Override
+    public String toString() {
+      if (isCached()) {
+        return "CachingCollector (" + (base+upto) + " docs & scores cached)";
+      } else {
+        return "CachingCollector (cache was cleared)";
+      }
     }
-    maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
-  }
-  
-  @Override
-  public void setScorer(Scorer scorer) throws IOException {
-    this.scorer = scorer;
-    other.setScorer(cachedScorer);
-  }
 
-  @Override
-  public boolean acceptsDocsOutOfOrder() {
-    return other.acceptsDocsOutOfOrder();
   }
 
-  @Override
-  public void collect(int doc) throws IOException {
+  // A CachingCollector which does not cache scores
+  private static final class NoScoreCachingCollector extends CachingCollector {
+    
+    NoScoreCachingCollector(Collector other, double maxRAMMB) {
+      super(other, maxRAMMB, false);
+    }
+    
+    @Override
+    public void collect(int doc) throws IOException {
 
-    if (curDocs == null) {
-      // Cache was too large
-      if (cacheScores) {
-        cachedScorer.score = scorer.score();
+      if (curDocs == null) {
+        // Cache was too large
+        other.collect(doc);
+        return;
       }
-      cachedScorer.doc = doc;
+
+      // Allocate a bigger array or abort caching
+      if (upto == curDocs.length) {
+        base += upto;
+        
+        // Compute next array length - don't allocate too big arrays
+        int nextLength = 8*curDocs.length;
+        if (nextLength > MAX_ARRAY_SIZE) {
+          nextLength = MAX_ARRAY_SIZE;
+        }
+
+        if (base + nextLength > maxDocsToCache) {
+          // try to allocate a smaller array
+          nextLength = maxDocsToCache - base;
+          if (nextLength <= 0) {
+            // Too many docs to collect -- clear cache
+            curDocs = null;
+            cachedSegs.clear();
+            cachedDocs.clear();
+            other.collect(doc);
+            return;
+          }
+        }
+        
+        curDocs = new int[nextLength];
+        cachedDocs.add(curDocs);
+        upto = 0;
+      }
+      
+      curDocs[upto] = doc;
+      upto++;
       other.collect(doc);
-      return;
     }
 
-    // Allocate a bigger array or abort caching
-    if (upto == curDocs.length) {
-      base += upto;
+    @Override
+    public void replay(Collector other) throws IOException {
+      replayInit(other);
       
-      // Compute next array length - don't allocate too big arrays
-      int nextLength = 8*curDocs.length;
-      if (nextLength > MAX_ARRAY_SIZE) {
-        nextLength = MAX_ARRAY_SIZE;
-      }
-
-      if (base + nextLength > maxDocsToCache) {
-        // try to allocate a smaller array
-        nextLength = maxDocsToCache - base;
-        if (nextLength <= 0) {
-          // Too many docs to collect -- clear cache
-          curDocs = null;
-          curScores = null;
-          cachedSegs.clear();
-          cachedDocs.clear();
-          cachedScores.clear();
-          if (cacheScores) {
-            cachedScorer.score = scorer.score();
+      int curUpto = 0;
+      int curBase = 0;
+      int chunkUpto = 0;
+      curDocs = EMPTY_INT_ARRAY;
+      for (SegStart seg : cachedSegs) {
+        other.setNextReader(seg.reader, seg.base);
+        while (curBase + curUpto < seg.end) {
+          if (curUpto == curDocs.length) {
+            curBase += curDocs.length;
+            curDocs = cachedDocs.get(chunkUpto);
+            chunkUpto++;
+            curUpto = 0;
           }
-          cachedScorer.doc = doc;
-          other.collect(doc);
-          return;
+          other.collect(curDocs[curUpto++]);
         }
       }
-      
-      curDocs = new int[nextLength];
-      cachedDocs.add(curDocs);
-      if (cacheScores) {
-        curScores = new float[nextLength];
-        cachedScores.add(curScores);
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      other.setScorer(scorer);
+    }
+
+    @Override
+    public String toString() {
+      if (isCached()) {
+        return "CachingCollector (" + (base+upto) + " docs cached)";
+      } else {
+        return "CachingCollector (cache was cleared)";
       }
-      upto = 0;
     }
-    
-    curDocs[upto] = doc;
-    // TODO: maybe specialize private subclass so we don't
-    // null check per collect...
+
+  }
+
+  // TODO: would be nice if a collector defined a
+  // needsScores() method so we can specialize / do checks
+  // up front. This is only relevant for the ScoreCaching 
+  // version -- if the wrapped Collector does not need 
+  // scores, it can avoid cachedScorer entirely.
+  protected final Collector other;
+
+  protected final int maxDocsToCache;
+  protected final List<SegStart> cachedSegs = new ArrayList<SegStart>();
+  protected final List<int[]> cachedDocs;
+
+  private IndexReader lastReader;
+  
+  protected int[] curDocs;
+  protected int upto;
+  protected int base;
+  protected int lastDocBase;
+
+  public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
+    return cacheScores ? new ScoreCachingCollector(other, maxRAMMB) : new NoScoreCachingCollector(other, maxRAMMB);
+  }
+  
+  // Prevent extension from non-internal classes
+  private CachingCollector(Collector other, double maxRAMMB, boolean cacheScores) {
+    this.other = other;
+
+    cachedDocs = new ArrayList<int[]>();
+    curDocs = new int[INITIAL_ARRAY_SIZE];
+    cachedDocs.add(curDocs);
+
+    int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
     if (cacheScores) {
-      cachedScorer.score = curScores[upto] = scorer.score();
+      bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
     }
-    upto++;
-    cachedScorer.doc = doc;
-    other.collect(doc);
+    maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    return other.acceptsDocsOutOfOrder();
   }
 
   public boolean isCached() {
@@ -223,26 +348,8 @@ public class CachingCollector extends Co
     lastReader = reader;
   }
 
-  @Override
-  public String toString() {
-    if (isCached()) {
-      return "CachingCollector (" + (base+upto) + " docs " + (cacheScores ? " & scores" : "") + " cached)";
-    } else {
-      return "CachingCollector (cache was cleared)";
-    }
-  }
-
-  /**
-   * Replays the cached doc IDs (and scores) to the given Collector.
-   * 
-   * @throws IllegalStateException
-   *           if this collector is not cached (i.e., if the RAM limits were too
-   *           low for the number of documents + scores to cache).
-   * @throws IllegalArgumentException
-   *           if the given Collect's does not support out-of-order collection,
-   *           while the collector passed to the ctor does.
-   */
-  public void replay(Collector other) throws IOException {
+  /** Reused by the specialized inner classes. */
+  void replayInit(Collector other) {
     if (!isCached()) {
       throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
     }
@@ -259,29 +366,20 @@ public class CachingCollector extends Co
       cachedSegs.add(new SegStart(lastReader, lastDocBase, base+upto));
       lastReader = null;
     }
-    
-    int curupto = 0;
-    int curbase = 0;
-    int chunkUpto = 0;
-    other.setScorer(cachedScorer);
-    curDocs = EMPTY_INT_ARRAY;
-    for (SegStart seg : cachedSegs) {
-      other.setNextReader(seg.reader, seg.base);
-      while (curbase + curupto < seg.end) {
-        if (curupto == curDocs.length) {
-          curbase += curDocs.length;
-          curDocs = cachedDocs.get(chunkUpto);
-          if (cacheScores) {
-            curScores = cachedScores.get(chunkUpto);
-          }
-          chunkUpto++;
-          curupto = 0;
-        }
-        if (cacheScores) {
-          cachedScorer.score = curScores[curupto];
-        }
-        other.collect(curDocs[curupto++]);
-      }
-    }
   }
+
+  /**
+   * Replays the cached doc IDs (and scores) to the given Collector. If this
+   * instance does not cache scores, {@code other.setScorer(Scorer)} is never
+   * called during replay, and scores are not replayed.
+   * 
+   * @throws IllegalStateException
+   *           if this collector is not cached (i.e., if the RAM limits were too
+   *           low for the number of documents + scores to cache).
+   * @throws IllegalArgumentException
+   *           if the given Collector does not support out-of-order collection,
+   *           while the collector passed to the ctor does.
+   */
+  public abstract void replay(Collector other) throws IOException;
+  
 }
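
Two behavioral points of the refactored class are worth noting. First,
create() now returns one of two private specializations up front
(ScoreCachingCollector or NoScoreCachingCollector), which removes the
per-collect "if (cacheScores)" branches of the old single-class
implementation. Second, replay(Collector) throws IllegalStateException
once the RAM cap has forced the cache to be cleared, so callers should
guard with isCached(). A minimal sketch of that guard, assuming a
searcher "s", a "query", and the two collectors shown:

  CachingCollector cc = CachingCollector.create(firstCollector, false, 4.0);
  s.search(query, cc);
  if (cc.isCached()) {
    // Cached doc IDs are replayed in order; scores are skipped because
    // cacheScores was false.
    cc.replay(secondCollector);
  } else {
    // maxRAMMB was exceeded and the cache was cleared; fall back to
    // re-running the search against the second collector.
    s.search(query, secondCollector);
  }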

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java?rev=1104680&r1=1104679&r2=1104680&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java Wed May 18 03:58:49 2011
@@ -75,39 +75,41 @@ public class TestCachingCollector extend
   }
 
   public void testBasic() throws Exception {
-    CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 1);
-    cc.setScorer(new MockScorer());
-    
-    // collect 1000 docs
-    for (int i = 0; i < 1000; i++) {
-      cc.collect(i);
-    }
-    
-    // now replay them
-    cc.replay(new Collector() {
-      int prevDocID = -1;
-      
-      @Override
-      public void setScorer(Scorer scorer) throws IOException {}
-      
-      @Override
-      public void setNextReader(IndexReader reader, int docBase) throws IOException {}
+    for (boolean cacheScores : new boolean[] { false, true }) {
+      CachingCollector cc = CachingCollector.create(new NoOpCollector(false), cacheScores, 1);
+      cc.setScorer(new MockScorer());
       
-      @Override
-      public void collect(int doc) throws IOException {
-        assertEquals(prevDocID + 1, doc);
-        prevDocID = doc;
+      // collect 1000 docs
+      for (int i = 0; i < 1000; i++) {
+        cc.collect(i);
       }
       
-      @Override
-      public boolean acceptsDocsOutOfOrder() {
-        return false;
-      }
-    });
+      // now replay them
+      cc.replay(new Collector() {
+        int prevDocID = -1;
+        
+        @Override
+        public void setScorer(Scorer scorer) throws IOException {}
+        
+        @Override
+        public void setNextReader(IndexReader reader, int docBase) throws IOException {}
+        
+        @Override
+        public void collect(int doc) throws IOException {
+          assertEquals(prevDocID + 1, doc);
+          prevDocID = doc;
+        }
+        
+        @Override
+        public boolean acceptsDocsOutOfOrder() {
+          return false;
+        }
+      });
+    }
   }
   
   public void testIllegalStateOnReplay() throws Exception {
-    CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+    CachingCollector cc = CachingCollector.create(new NoOpCollector(false), true, 50 * ONE_BYTE);
     cc.setScorer(new MockScorer());
     
     // collect 130 docs, this should be enough for triggering cache abort.
@@ -130,14 +132,14 @@ public class TestCachingCollector extend
     // is valid with the Collector passed to the ctor
     
     // 'src' Collector does not support out-of-order
-    CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+    CachingCollector cc = CachingCollector.create(new NoOpCollector(false), true, 50 * ONE_BYTE);
     cc.setScorer(new MockScorer());
     for (int i = 0; i < 10; i++) cc.collect(i);
     cc.replay(new NoOpCollector(true)); // this call should not fail
     cc.replay(new NoOpCollector(false)); // this call should not fail
 
     // 'src' Collector supports out-of-order
-    cc = new CachingCollector(new NoOpCollector(true), true, 50 * ONE_BYTE);
+    cc = CachingCollector.create(new NoOpCollector(true), true, 50 * ONE_BYTE);
     cc.setScorer(new MockScorer());
     for (int i = 0; i < 10; i++) cc.collect(i);
     cc.replay(new NoOpCollector(true)); // this call should not fail
@@ -156,14 +158,18 @@ public class TestCachingCollector extend
     
     // set RAM limit enough for 150 docs + random(10000)
     int numDocs = random.nextInt(10000) + 150;
-    CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 8 * ONE_BYTE * numDocs);
-    cc.setScorer(new MockScorer());
-    for (int i = 0; i < numDocs; i++) cc.collect(i);
-    assertTrue(cc.isCached());
-    
-    // The 151's document should terminate caching
-    cc.collect(numDocs);
-    assertFalse(cc.isCached());
+    for (boolean cacheScores : new boolean[] { false, true }) {
+      int bytesPerDoc = cacheScores ? 8 : 4;
+      CachingCollector cc = CachingCollector.create(new NoOpCollector(false),
+          cacheScores, bytesPerDoc * ONE_BYTE * numDocs);
+      cc.setScorer(new MockScorer());
+      for (int i = 0; i < numDocs; i++) cc.collect(i);
+      assertTrue(cc.isCached());
+      
+      // Collecting one more document should exceed the limit and terminate caching
+      cc.collect(numDocs);
+      assertFalse(cc.isCached());
+    }
   }
   
 }
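
Finally, the RAM budget maps to a document cap as

  maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc)

where bytesPerDoc is 4 (one int per doc) without scores and 8 (int +
float) with scores -- which is why the updated test sizes its budget
with a bytesPerDoc of 4 or 8. Assuming the test's ONE_BYTE constant
represents one byte expressed in MB (1.0 / (1024 * 1024)), a quick
check of the arithmetic:

  double maxRAMMB = 1.0;                // a 1 MB budget
  int bytesPerDoc = 4 + 4;              // NUM_BYTES_INT + NUM_BYTES_FLOAT
  int maxDocs = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
  // maxDocs == 131072: 1 MB caches at most 128K doc/score pairs.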