You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2013/05/29 10:28:16 UTC

svn commit: r1487399 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/facet/ lucene/facet/src/java/org/apache/lucene/facet/sampling/ lucene/facet/src/java/org/apache/lucene/facet/search/ lucene/facet/src/test/org/apache/lucene/facet/sampling/

Author: shaie
Date: Wed May 29 08:28:15 2013
New Revision: 1487399

URL: http://svn.apache.org/r1487399
Log:
LUCENE-5015: Unexpected performance difference between SamplingAccumulator and StandardFacetAccumulator

Added:
    lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
      - copied unchanged from r1487397, lucene/dev/trunk/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
    lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
    lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed May 29 08:28:15 2013
@@ -54,6 +54,10 @@ Changes in backwards compatibility polic
   DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
   update offsets anymore. (Adrien Grand)
 
+* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled 
+  categories. You should set TakmiSampleFixer on SamplingParams if required (but 
+  notice that this means slower search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
+
 Bug Fixes
 
 * LUCENE-4997: Internal test framework's tests are sensitive to previous 

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java Wed May 29 08:28:15 2013
@@ -3,6 +3,7 @@ package org.apache.lucene.facet.sampling
 import java.io.IOException;
 
 import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 
 /*
@@ -23,22 +24,50 @@ import org.apache.lucene.facet.search.Sc
  */
 
 /**
- * Fixer of sample facet accumulation results
+ * Fixer of sample facet accumulation results.
  * 
  * @lucene.experimental
  */
-public interface SampleFixer {
+public abstract class SampleFixer {
   
   /**
    * Alter the input result, fixing it to account for the sampling. This
-   * implementation can compute accurate or estimated counts for the sampled facets. 
-   * For example, a faster correction could just multiply by a compensating factor.
+   * implementation can compute accurate or estimated counts for the sampled
+   * facets. For example, a faster correction could just multiply by a
+   * compensating factor.
    * 
    * @param origDocIds
    *          full set of matching documents.
    * @param fres
    *          sample result to be fixed.
-   * @throws IOException If there is a low-level I/O error.
+   * @throws IOException
+   *           If there is a low-level I/O error.
    */
-  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException; 
+  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException {
+    FacetResultNode topRes = fres.getFacetResultNode();
+    fixResultNode(topRes, origDocIds, samplingRatio);
+  }
+  
+  /**
+   * Fix result node count, and, recursively, fix all its children
+   * 
+   * @param facetResNode
+   *          result node to be fixed
+   * @param docIds
+   *          docids in effect
+   * @throws IOException
+   *           If there is a low-level I/O error.
+   */
+  protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) 
+      throws IOException {
+    singleNodeFix(facetResNode, docIds, samplingRatio);
+    for (FacetResultNode frn : facetResNode.subResults) {
+      fixResultNode(frn, docIds, samplingRatio);
+    }
+  }
+  
+  /** Fix the given node's value. */
+  protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) 
+      throws IOException;
+  
 }
\ No newline at end of file

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java Wed May 29 08:28:15 2013
@@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.Fa
 import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.index.IndexReader;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -111,16 +110,6 @@ public abstract class Sampler {
       throws IOException;
 
   /**
-   * Get a fixer of sample facet accumulation results. Default implementation
-   * returns a <code>TakmiSampleFixer</code> which is adequate only for
-   * counting. For any other accumulator, provide a different fixer.
-   */
-  public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
-      FacetSearchParams searchParams) {
-    return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
-  }
-  
-  /**
    * Result of sample computation
    */
   public final static class SampleResult {

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java Wed May 29 08:28:15 2013
@@ -79,7 +79,11 @@ public class SamplingAccumulator extends
   public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
     // Replacing the original searchParams with the over-sampled
     FacetSearchParams original = searchParams;
-    searchParams = sampler.overSampledSearchParams(original);
+    SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer();
+    final boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+    if (shouldOversample) {
+      searchParams = sampler.overSampledSearchParams(original);
+    }
     
     List<FacetResult> sampleRes = super.accumulate(docids);
     
@@ -87,14 +91,18 @@ public class SamplingAccumulator extends
     for (FacetResult fres : sampleRes) {
       // for sure fres is not null because this is guaranteed by the delegee.
       PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
-      
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
-
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
+      if (samplerFixer != null) {
+        // fix the result of current request
+        samplerFixer.fixResult(docids, fres, samplingRatio);
+        
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
 
+        if (shouldOversample) {
+          // Using the sampler to trim the extra (over-sampled) results
+          fres = sampler.trimResult(fres);
+        }
+      }
+      
       // final labeling if allowed (because labeling is a costly operation)
       frh.labelResult(fres);
       fixedRes.add(fres); // add to final results

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java Wed May 29 08:28:15 2013
@@ -28,7 +28,7 @@ public class SamplingParams {
    * Default factor by which more results are requested over the sample set.
    * @see SamplingParams#getOversampleFactor()
    */
-  public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d;
+  public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d;
   
   /**
    * Default ratio between size of sample to original size of document set.
@@ -59,6 +59,8 @@ public class SamplingParams {
   private double sampleRatio = DEFAULT_SAMPLE_RATIO;
   private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
   private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
+
+  private SampleFixer sampleFixer = null;
   
   /**
    * Return the maxSampleSize.
@@ -166,4 +168,29 @@ public class SamplingParams {
     this.oversampleFactor = oversampleFactor;
   }
 
-}
\ No newline at end of file
+  /**
+   * @return {@link SampleFixer} to be used while fixing the sampled results, if
+   *         <code>null</code> no fixing will be performed
+   */
+  public SampleFixer getSampleFixer() {
+    return sampleFixer;
+  }
+
+  /**
+   * Set a {@link SampleFixer} to be used while fixing the sampled results.
+   * {@code null} means no fixing will be performed
+   */
+  public void setSampleFixer(SampleFixer sampleFixer) {
+    this.sampleFixer = sampleFixer;
+  }
+
+  /**
+   * Returns whether over-sampling should be done. By default returns
+   * {@code true} when {@link #getSampleFixer()} is not {@code null} and
+   * {@link #getOversampleFactor()} &gt; 1, {@code false} otherwise.
+   */
+  public boolean shouldOverSample() {
+    return sampleFixer != null && oversampleFactor > 1d;
+  }
+  
+}

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java Wed May 29 08:28:15 2013
@@ -52,29 +52,41 @@ public class SamplingWrapper extends Sta
   public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
     // Replacing the original searchParams with the over-sampled (and without statistics-compute)
     FacetSearchParams original = delegee.searchParams;
-    delegee.searchParams = sampler.overSampledSearchParams(original);
+    boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+   
+    if (shouldOversample) {
+      delegee.searchParams = sampler.overSampledSearchParams(original);
+    }
     
     SampleResult sampleSet = sampler.getSampleSet(docids);
 
     List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);
 
     List<FacetResult> fixedRes = new ArrayList<FacetResult>();
+    SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer();
+    
     for (FacetResult fres : sampleRes) {
       // for sure fres is not null because this is guaranteed by the delegee.
       PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
-      // fix the result of current request
-      sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres); 
-      fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      if (sampleFixer != null) {
+        // fix the result of current request
+        sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio); 
+        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+      }
       
-      // Using the sampler to trim the extra (over-sampled) results
-      fres = sampler.trimResult(fres);
+      if (shouldOversample) {
+        // Using the sampler to trim the extra (over-sampled) results
+        fres = sampler.trimResult(fres);
+      }
       
       // final labeling if allowed (because labeling is a costly operation)
       frh.labelResult(fres);
       fixedRes.add(fres); // add to final results
     }
 
-    delegee.searchParams = original; // Back to original params
+    if (shouldOversample) {
+      delegee.searchParams = original; // Back to original params
+    }
     
     return fixedRes; 
   }

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java Wed May 29 08:28:15 2013
@@ -2,21 +2,19 @@ package org.apache.lucene.facet.sampling
 
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
-
 import org.apache.lucene.facet.params.FacetSearchParams;
 import org.apache.lucene.facet.search.DrillDownQuery;
-import org.apache.lucene.facet.search.FacetResult;
 import org.apache.lucene.facet.search.FacetResultNode;
 import org.apache.lucene.facet.search.ScoredDocIDs;
 import org.apache.lucene.facet.search.ScoredDocIDsIterator;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,16 +34,21 @@ import org.apache.lucene.facet.taxonomy.
  */
 
 /**
- * Fix sampling results by counting the intersection between two lists: a
- * TermDocs (list of documents in a certain category) and a DocIdSetIterator
- * (list of documents matching the query).
- * 
+ * Fix sampling results by correct results, by counting the intersection between
+ * two lists: a TermDocs (list of documents in a certain category) and a
+ * DocIdSetIterator (list of documents matching the query).
+ * <p>
+ * This fixer is suitable for scenarios which prioritize accuracy over
+ * performance. 
+ * <p>
+ * <b>Note:</b> for statistically more accurate top-k selection, set
+ * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
+ * least 2, so that the top-k categories would have better chance of showing up
+ * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
  * 
  * @lucene.experimental
  */
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
+public class TakmiSampleFixer extends SampleFixer {
   
   private TaxonomyReader taxonomyReader;
   private IndexReader indexReader;
@@ -59,28 +62,10 @@ class TakmiSampleFixer implements Sample
   }
 
   @Override
-  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
-      throws IOException {
-    FacetResultNode topRes = fres.getFacetResultNode();
-    fixResultNode(topRes, origDocIds);
-  }
-  
-  /**
-   * Fix result node count, and, recursively, fix all its children
-   * 
-   * @param facetResNode
-   *          result node to be fixed
-   * @param docIds
-   *          docids in effect
-   * @throws IOException If there is a low-level I/O error.
-   */
-  private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
+  public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
     recount(facetResNode, docIds);
-    for (FacetResultNode frn : facetResNode.subResults) {
-      fixResultNode(frn, docIds);
-    }
   }
-
+  
   /**
    * Internal utility: recount for a facet result node
    * 
@@ -179,4 +164,5 @@ class TakmiSampleFixer implements Sample
     }
     return false; // exhausted
   }
+
 }
\ No newline at end of file

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java Wed May 29 08:28:15 2013
@@ -94,7 +94,7 @@ public class StandardFacetsAccumulator e
 
   private Object accumulateGuard;
 
-  private double complementThreshold;
+  private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
   
   public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, 
       TaxonomyReader taxonomyReader) {

Modified: lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java Wed May 29 08:28:15 2013
@@ -94,7 +94,7 @@ public abstract class BaseSampleTestTopK
         for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
           try {
             // complement with sampling!
-            final Sampler sampler = createSampler(nTrial, useRandomSampler);
+            final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
             
             assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
             assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
@@ -128,14 +128,20 @@ public abstract class BaseSampleTestTopK
     return FacetsCollector.create(sfa);
   }
   
-  private Sampler createSampler(int nTrial, boolean useRandomSampler) {
+  private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
     SamplingParams samplingParams = new SamplingParams();
     
+    /*
+     * Set sampling to Exact fixing with TakmiSampleFixer as it is not easy to
+     * validate results with amortized results. 
+     */
+    samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
+        
     final double retryFactor = Math.pow(1.01, nTrial);
+    samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling 
     samplingParams.setSampleRatio(0.8 * retryFactor);
     samplingParams.setMinSampleSize((int) (100 * retryFactor));
     samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
-    samplingParams.setOversampleFactor(5.0 * retryFactor);
     samplingParams.setSamplingThreshold(11000); //force sampling
 
     Sampler sampler = useRandomSampler ?