Posted to commits@lucene.apache.org by sh...@apache.org on 2011/05/16 22:29:11 UTC
svn commit: r1103872 - in /lucene/dev/trunk:
lucene/src/java/org/apache/lucene/search/
lucene/src/test/org/apache/lucene/search/
modules/grouping/src/java/org/apache/lucene/search/grouping/
modules/grouping/src/test/org/apache/lucene/search/grouping/
Author: shaie
Date: Mon May 16 20:29:10 2011
New Revision: 1103872
URL: http://svn.apache.org/viewvc?rev=1103872&view=rev
Log:
LUCENE-3102: first cut - some refactoring, bug fixes, add test, move to core (trunk)
Added:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/CachingCollector.java
- copied, changed from r1103861, lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java (with props)
Removed:
lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java
Modified:
lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
Copied: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/CachingCollector.java (from r1103861, lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/CachingCollector.java?p2=lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/CachingCollector.java&p1=lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java&r1=1103861&r2=1103872&rev=1103872&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/CachingCollector.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/CachingCollector.java Mon May 16 20:29:10 2011
@@ -1,4 +1,4 @@
-package org.apache.lucene.search.grouping;
+package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,8 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.RamUsageEstimator;
/**
@@ -41,6 +39,9 @@ import org.apache.lucene.util.RamUsageEs
* set is large this can easily be a very substantial amount
* of RAM!
*
+ * <p><b>NOTE</b>: this class caches at least 128 documents
+ * before checking RAM limits.
+ *
* <p>See {@link org.apache.lucene.search.grouping} for more
* details including a full code example.</p>
*
@@ -48,6 +49,11 @@ import org.apache.lucene.util.RamUsageEs
*/
public class CachingCollector extends Collector {
+ // Cap the cached arrays at 512K entries each
+ private static final int MAX_ARRAY_SIZE = 512 * 1024;
+ private static final int INITIAL_ARRAY_SIZE = 128;
+ private final static int[] EMPTY_INT_ARRAY = new int[0];
+
private static class SegStart {
public final AtomicReaderContext readerContext;
public final int end;
@@ -57,6 +63,33 @@ public class CachingCollector extends Co
this.end = end;
}
}
+
+ private static class CachedScorer extends Scorer {
+
+ // NOTE: these members are package-private because that way accessing them
+ // from the outer class does not incur an access check by the JVM. The same
+ // would apply if they were defined in the outer class as private members.
+ int doc;
+ float score;
+
+ private CachedScorer() { super(null); }
+
+ @Override
+ public float score() { return score; }
+
+ @Override
+ public int advance(int target) { throw new UnsupportedOperationException(); }
+
+ @Override
+ public int docID() { return doc; }
+
+ @Override
+ public float freq() { throw new UnsupportedOperationException(); }
+
+ @Override
+ public int nextDoc() { throw new UnsupportedOperationException(); }
+ }
// TODO: would be nice if a collector defined a
// needsScores() method so we can specialize / do checks
@@ -64,7 +97,8 @@ public class CachingCollector extends Co
private final Collector other;
private final int maxDocsToCache;
- private final Scorer cachedScorer;
+ private final boolean cacheScores;
+ private final CachedScorer cachedScorer;
private final List<int[]> cachedDocs;
private final List<float[]> cachedScores;
private final List<SegStart> cachedSegs = new ArrayList<SegStart>();
@@ -74,39 +108,13 @@ public class CachingCollector extends Co
private float[] curScores;
private int upto;
private AtomicReaderContext lastReaderContext;
- private float score;
private int base;
- private int doc;
public CachingCollector(Collector other, boolean cacheScores, double maxRAMMB) {
this.other = other;
+ this.cacheScores = cacheScores;
if (cacheScores) {
- cachedScorer = new Scorer(null) {
- @Override
- public float score() {
- return score;
- }
-
- @Override
- public int advance(int target) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int docID() {
- return doc;
- }
-
- @Override
- public float freq() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int nextDoc() {
- throw new UnsupportedOperationException();
- }
- };
+ cachedScorer = new CachedScorer();
cachedScores = new ArrayList<float[]>();
curScores = new float[128];
cachedScores.add(curScores);
@@ -115,16 +123,14 @@ public class CachingCollector extends Co
cachedScores = null;
}
cachedDocs = new ArrayList<int[]>();
- curDocs = new int[128];
+ curDocs = new int[INITIAL_ARRAY_SIZE];
cachedDocs.add(curDocs);
- final int bytesPerDoc;
- if (curScores != null) {
- bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT;
- } else {
- bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
+ int bytesPerDoc = RamUsageEstimator.NUM_BYTES_INT;
+ if (cacheScores) {
+ bytesPerDoc += RamUsageEstimator.NUM_BYTES_FLOAT;
}
- maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024)/bytesPerDoc);
+ maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
}
@Override
@@ -143,52 +149,60 @@ public class CachingCollector extends Co
if (curDocs == null) {
// Cache was too large
- if (curScores != null) {
- score = scorer.score();
+ if (cacheScores) {
+ cachedScorer.score = scorer.score();
}
- this.doc = doc;
+ cachedScorer.doc = doc;
other.collect(doc);
return;
}
+ // Allocate a bigger array or abort caching
if (upto == curDocs.length) {
base += upto;
- final int nextLength;
- // Max out at 512K arrays:
- if (curDocs.length < 524288) {
- nextLength = 8*curDocs.length;
- } else {
- nextLength = curDocs.length;
+
+ // Compute the next array length; don't allocate arrays that are too large
+ int nextLength = 8*curDocs.length;
+ if (nextLength > MAX_ARRAY_SIZE) {
+ nextLength = MAX_ARRAY_SIZE;
}
if (base + nextLength > maxDocsToCache) {
- // Too many docs to collect -- clear cache
- curDocs = null;
- if (curScores != null) {
- score = scorer.score();
+ // try to allocate a smaller array
+ nextLength = maxDocsToCache - base;
+ if (nextLength <= 0) {
+ // Too many docs to collect -- clear cache
+ curDocs = null;
+ curScores = null;
+ cachedSegs.clear();
+ cachedDocs.clear();
+ cachedScores.clear();
+ if (cacheScores) {
+ cachedScorer.score = scorer.score();
+ }
+ cachedScorer.doc = doc;
+ other.collect(doc);
+ return;
}
- this.doc = doc;
- other.collect(doc);
- cachedDocs.clear();
- cachedScores.clear();
- return;
}
+
curDocs = new int[nextLength];
cachedDocs.add(curDocs);
- if (curScores != null) {
+ if (cacheScores) {
curScores = new float[nextLength];
cachedScores.add(curScores);
}
upto = 0;
}
+
curDocs[upto] = doc;
// TODO: maybe specialize private subclass so we don't
// null check per collect...
- if (curScores != null) {
- score = curScores[upto] = scorer.score();
+ if (cacheScores) {
+ cachedScorer.score = curScores[upto] = scorer.score();
}
upto++;
- this.doc = doc;
+ cachedScorer.doc = doc;
other.collect(doc);
}
@@ -205,55 +219,65 @@ public class CachingCollector extends Co
lastReaderContext = context;
}
- private final static int[] EMPTY_INT_ARRAY = new int[0];
-
@Override
public String toString() {
if (isCached()) {
- return "CachingCollector (" + (base+upto) + " docs " + (curScores != null ? " & scores" : "") + " cached)";
+ return "CachingCollector (" + (base+upto) + " docs " + (cacheScores ? " & scores" : "") + " cached)";
} else {
return "CachingCollector (cache was cleared)";
}
}
+ /**
+ * Replays the cached doc IDs (and scores) to the given Collector.
+ *
+ * @throws IllegalStateException
+ * if this collector is not cached (i.e., if the RAM limits were too
+ * low for the number of documents + scores to cache).
+ * @throws IllegalArgumentException
+ * if the given Collector does not support out-of-order collection,
+ * while the collector passed to the ctor does.
+ */
public void replay(Collector other) throws IOException {
if (!isCached()) {
throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
}
+
+ if (!other.acceptsDocsOutOfOrder() && this.other.acceptsDocsOutOfOrder()) {
+ throw new IllegalArgumentException(
+ "cannot replay: given collector does not support "
+ + "out-of-order collection, while the wrapped collector does. "
+ + "Therefore cached documents may be out-of-order.");
+ }
+
//System.out.println("CC: replay totHits=" + (upto + base));
if (lastReaderContext != null) {
cachedSegs.add(new SegStart(lastReaderContext, base+upto));
lastReaderContext = null;
}
- final int uptoSav = upto;
- final int baseSav = base;
- try {
- upto = 0;
- base = 0;
- int chunkUpto = 0;
- other.setScorer(cachedScorer);
- curDocs = EMPTY_INT_ARRAY;
- for(SegStart seg : cachedSegs) {
- other.setNextReader(seg.readerContext);
- while(base+upto < seg.end) {
- if (upto == curDocs.length) {
- base += curDocs.length;
- curDocs = cachedDocs.get(chunkUpto);
- if (curScores != null) {
- curScores = cachedScores.get(chunkUpto);
- }
- chunkUpto++;
- upto = 0;
- }
- if (curScores != null) {
- score = curScores[upto];
+
+ int curupto = 0;
+ int curbase = 0;
+ int chunkUpto = 0;
+ other.setScorer(cachedScorer);
+ curDocs = EMPTY_INT_ARRAY;
+ for(SegStart seg : cachedSegs) {
+ other.setNextReader(seg.readerContext);
+ while(curbase+curupto < seg.end) {
+ if (curupto == curDocs.length) {
+ curbase += curDocs.length;
+ curDocs = cachedDocs.get(chunkUpto);
+ if (cacheScores) {
+ curScores = cachedScores.get(chunkUpto);
}
- other.collect(curDocs[upto++]);
+ chunkUpto++;
+ curupto = 0;
+ }
+ if (cacheScores) {
+ cachedScorer.score = curScores[curupto];
}
+ other.collect(curDocs[curupto++]);
}
- } finally {
- upto = uptoSav;
- base = baseSav;
}
}
}
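(Editorial note, not part of the committed diff.) The class above is meant for a cache-then-replay pattern: wrap the first-pass Collector, run the query once, and replay the cached hits to a second Collector, falling back to a fresh search if the RAM limit was exceeded and the cache was cleared. A minimal sketch, assuming only the public constructor, isCached() and replay() visible in this diff plus the standard IndexSearcher.search(Query, Collector) overload; the searcher, query and collector arguments are placeholders:

import java.io.IOException;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

public class TwoPassSearchSketch {
  /** Runs the query once, caching hits so the second pass can avoid a re-search. */
  public static void twoPass(IndexSearcher searcher, Query query,
                             Collector firstPass, Collector secondPass,
                             double maxRAMMB) throws IOException {
    // cacheScores=true also caches per-hit scores (8 bytes/doc instead of 4)
    CachingCollector cc = new CachingCollector(firstPass, true, maxRAMMB);
    searcher.search(query, cc);        // hits are forwarded to firstPass and cached
    if (cc.isCached()) {
      // replay() throws IllegalArgumentException if secondPass does not accept
      // out-of-order docs while the wrapped firstPass collector does
      cc.replay(secondPass);           // cheap replay of cached doc IDs and scores
    } else {
      searcher.search(query, secondPass);  // cache was cleared: run the query again
    }
  }
}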
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java?rev=1103872&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/search/TestCachingCollector.java Mon May 16 20:29:10 2011
@@ -0,0 +1,169 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.CachingCollector;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCachingCollector extends LuceneTestCase {
+
+ private static final double ONE_BYTE = 1.0 / (1024 * 1024); // 1 byte expressed in MB
+
+ private static class MockScorer extends Scorer {
+
+ private MockScorer() {
+ super((Weight) null);
+ }
+
+ @Override
+ public float score() throws IOException { return 0; }
+
+ @Override
+ public int docID() { return 0; }
+
+ @Override
+ public int nextDoc() throws IOException { return 0; }
+
+ @Override
+ public int advance(int target) throws IOException { return 0; }
+
+ }
+
+ private static class NoOpCollector extends Collector {
+
+ private final boolean acceptDocsOutOfOrder;
+
+ public NoOpCollector(boolean acceptDocsOutOfOrder) {
+ this.acceptDocsOutOfOrder = acceptDocsOutOfOrder;
+ }
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {}
+
+ @Override
+ public void collect(int doc) throws IOException {}
+
+ @Override
+ public void setNextReader(AtomicReaderContext context) throws IOException {}
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return acceptDocsOutOfOrder;
+ }
+
+ }
+
+ public void testBasic() throws Exception {
+ CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 1);
+ cc.setScorer(new MockScorer());
+
+ // collect 1000 docs
+ for (int i = 0; i < 1000; i++) {
+ cc.collect(i);
+ }
+
+ // now replay them
+ cc.replay(new Collector() {
+ int prevDocID = -1;
+
+ @Override
+ public void setScorer(Scorer scorer) throws IOException {}
+
+ @Override
+ public void setNextReader(AtomicReaderContext context) throws IOException {}
+
+ @Override
+ public void collect(int doc) throws IOException {
+ assertEquals(prevDocID + 1, doc);
+ prevDocID = doc;
+ }
+
+ @Override
+ public boolean acceptsDocsOutOfOrder() {
+ return false;
+ }
+ });
+ }
+
+ public void testIllegalStateOnReplay() throws Exception {
+ CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+ cc.setScorer(new MockScorer());
+
+ // collect 130 docs; this should be enough to trigger a cache abort.
+ for (int i = 0; i < 130; i++) {
+ cc.collect(i);
+ }
+
+ assertFalse("CachingCollector should not be cached due to low memory limit", cc.isCached());
+
+ try {
+ cc.replay(new NoOpCollector(false));
+ fail("replay should fail if CachingCollector is not cached");
+ } catch (IllegalStateException e) {
+ // expected
+ }
+ }
+
+ public void testIllegalCollectorOnReplay() throws Exception {
+ // tests that the Collector passed to replay() has an out-of-order mode that
+ // is compatible with the Collector passed to the ctor
+
+ // 'src' Collector does not support out-of-order
+ CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 50 * ONE_BYTE);
+ cc.setScorer(new MockScorer());
+ for (int i = 0; i < 10; i++) cc.collect(i);
+ cc.replay(new NoOpCollector(true)); // this call should not fail
+ cc.replay(new NoOpCollector(false)); // this call should not fail
+
+ // 'src' Collector supports out-of-order
+ cc = new CachingCollector(new NoOpCollector(true), true, 50 * ONE_BYTE);
+ cc.setScorer(new MockScorer());
+ for (int i = 0; i < 10; i++) cc.collect(i);
+ cc.replay(new NoOpCollector(true)); // this call should not fail
+ try {
+ cc.replay(new NoOpCollector(false)); // this call should fail
+ fail("should have failed if an in-order Collector was given to replay(), " +
+ "while CachingCollector was initialized with out-of-order collection");
+ } catch (IllegalArgumentException e) {
+ // ok
+ }
+ }
+
+ public void testCachedArraysAllocation() throws Exception {
+ // tests the cached-array allocation -- if 'nextLength' were too large,
+ // caching would terminate even though a smaller array would suffice.
+
+ // set a RAM limit just large enough for numDocs (150 + random(10000)) docs
+ int numDocs = random.nextInt(10000) + 150;
+ CachingCollector cc = new CachingCollector(new NoOpCollector(false), true, 8 * ONE_BYTE * numDocs);
+ cc.setScorer(new MockScorer());
+ for (int i = 0; i < numDocs; i++) cc.collect(i);
+ assertTrue(cc.isCached());
+
+ // Collecting one more document (beyond numDocs) should terminate caching
+ cc.collect(numDocs);
+ assertFalse(cc.isCached());
+ }
+
+}
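(Editorial note, not part of the committed diff.) The RAM budget in testCachedArraysAllocation works out exactly: with cacheScores=true the constructor budgets NUM_BYTES_INT + NUM_BYTES_FLOAT bytes per cached hit (4 + 4 = 8, assuming the usual RamUsageEstimator constants), so a limit of 8 * ONE_BYTE * numDocs MB yields maxDocsToCache == numDocs; collecting one extra document then crosses the limit and clears the cache. A small sketch of that arithmetic, evaluated for the test's lower bound of 150 docs:

public class CacheBudgetSketch {
  public static void main(String[] args) {
    double ONE_BYTE = 1.0 / (1024 * 1024);     // 1 byte expressed in MB
    int numDocs = 150;                         // lower bound used by the test
    double maxRAMMB = 8 * ONE_BYTE * numDocs;  // limit passed to the ctor
    int bytesPerDoc = 4 + 4;                   // doc id (int) + score (float)
    int maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
    System.out.println(maxDocsToCache);        // prints 150, i.e. numDocs exactly
  }
}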
Modified: lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html?rev=1103872&r1=1103871&r2=1103872&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html (original)
+++ lucene/dev/trunk/modules/grouping/src/java/org/apache/lucene/search/grouping/package.html Mon May 16 20:29:10 2011
@@ -49,7 +49,7 @@ field fall into a single group.</p>
org.apache.lucene.search.grouping.SecondPassGroupingCollector})
gathers documents within those groups. If the search is costly to
run you may want to use the {@link
- org.apache.lucene.search.grouping.CachingCollector} class, which
+ org.apache.lucene.search.CachingCollector} class, which
caches hits and can (quickly) replay them for the second pass. This
way you only run the query once, but you pay a RAM cost to (briefly)
hold all hits. Results are returned as a {@link
@@ -66,7 +66,7 @@ field fall into a single group.</p>
group yourself.
</ul>
-<p>Typical usage looks like this (using the {@link org.apache.lucene.search.grouping.CachingCollector}):</p>
+<p>Typical usage looks like this (using the {@link org.apache.lucene.search.CachingCollector}):</p>
<pre>
FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("author", groupSort, groupOffset+topNGroups);
Modified: lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1103872&r1=1103871&r2=1103872&view=diff
==============================================================================
--- lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/trunk/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Mon May 16 20:29:10 2011
@@ -32,6 +32,7 @@ import org.apache.lucene.document.Numeri
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldDoc;