You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2013/05/29 10:28:16 UTC
svn commit: r1487399 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/facet/ lucene/facet/src/java/org/apache/lucene/facet/sampling/
lucene/facet/src/java/org/apache/lucene/facet/search/
lucene/facet/src/test/org/apache/lucene/facet/sampling/
Author: shaie
Date: Wed May 29 08:28:15 2013
New Revision: 1487399
URL: http://svn.apache.org/r1487399
Log:
LUCENE-5015: Unexpected performance difference between SamplingAccumulator and StandardFacetAccumulator
Added:
lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
- copied unchanged from r1487397, lucene/dev/trunk/lucene/facet/src/test/org/apache/lucene/facet/sampling/SamplerTest.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/facet/ (props changed)
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed May 29 08:28:15 2013
@@ -54,6 +54,10 @@ Changes in backwards compatibility polic
DictionaryCompoundWordTokenFilter and HyphenationCompoundWordTokenFilter don't
update offsets anymore. (Adrien Grand)
+* LUCENE-5015: SamplingAccumulator no longer corrects the counts of the sampled
+ categories. You should set TakmiSampleFixer on SamplingParams if required (but
+ notice that this means slower search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
+
Bug Fixes
* LUCENE-4997: Internal test framework's tests are sensitive to previous
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SampleFixer.java Wed May 29 08:28:15 2013
@@ -3,6 +3,7 @@ package org.apache.lucene.facet.sampling
import java.io.IOException;
import org.apache.lucene.facet.search.FacetResult;
+import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
/*
@@ -23,22 +24,50 @@ import org.apache.lucene.facet.search.Sc
*/
/**
- * Fixer of sample facet accumulation results
+ * Fixer of sample facet accumulation results.
*
* @lucene.experimental
*/
-public interface SampleFixer {
+public abstract class SampleFixer {
/**
* Alter the input result, fixing it to account for the sampling. This
- * implementation can compute accurate or estimated counts for the sampled facets.
- * For example, a faster correction could just multiply by a compensating factor.
+ * implementation can compute accurate or estimated counts for the sampled
+ * facets. For example, a faster correction could just multiply by a
+ * compensating factor.
*
* @param origDocIds
* full set of matching documents.
* @param fres
* sample result to be fixed.
- * @throws IOException If there is a low-level I/O error.
+ * @throws IOException
+ * If there is a low-level I/O error.
*/
- public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException;
+ public void fixResult(ScoredDocIDs origDocIds, FacetResult fres, double samplingRatio) throws IOException {
+ FacetResultNode topRes = fres.getFacetResultNode();
+ fixResultNode(topRes, origDocIds, samplingRatio);
+ }
+
+ /**
+ * Fix result node count, and, recursively, fix all its children
+ *
+ * @param facetResNode
+ * result node to be fixed
+ * @param docIds
+ * docids in effect
+ * @throws IOException
+ * If there is a low-level I/O error.
+ */
+ protected void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
+ throws IOException {
+ singleNodeFix(facetResNode, docIds, samplingRatio);
+ for (FacetResultNode frn : facetResNode.subResults) {
+ fixResultNode(frn, docIds, samplingRatio);
+ }
+ }
+
+ /** Fix the given node's value. */
+ protected abstract void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio)
+ throws IOException;
+
}
\ No newline at end of file
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/Sampler.java Wed May 29 08:28:15 2013
@@ -12,7 +12,6 @@ import org.apache.lucene.facet.search.Fa
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.index.IndexReader;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -111,16 +110,6 @@ public abstract class Sampler {
throws IOException;
/**
- * Get a fixer of sample facet accumulation results. Default implementation
- * returns a <code>TakmiSampleFixer</code> which is adequate only for
- * counting. For any other accumulator, provide a different fixer.
- */
- public SampleFixer getSampleFixer(IndexReader indexReader, TaxonomyReader taxonomyReader,
- FacetSearchParams searchParams) {
- return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
- }
-
- /**
* Result of sample computation
*/
public final static class SampleResult {
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingAccumulator.java Wed May 29 08:28:15 2013
@@ -79,7 +79,11 @@ public class SamplingAccumulator extends
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
// Replacing the original searchParams with the over-sampled
FacetSearchParams original = searchParams;
- searchParams = sampler.overSampledSearchParams(original);
+ SampleFixer samplerFixer = sampler.samplingParams.getSampleFixer();
+ final boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+ if (shouldOversample) {
+ searchParams = sampler.overSampledSearchParams(original);
+ }
List<FacetResult> sampleRes = super.accumulate(docids);
@@ -87,14 +91,18 @@ public class SamplingAccumulator extends
for (FacetResult fres : sampleRes) {
// for sure fres is not null because this is guaranteed by the delegee.
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
- // fix the result of current request
- sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
-
- fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
-
- // Using the sampler to trim the extra (over-sampled) results
- fres = sampler.trimResult(fres);
+ if (samplerFixer != null) {
+ // fix the result of current request
+ samplerFixer.fixResult(docids, fres, samplingRatio);
+
+ fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any arranging it needs to
+ if (shouldOversample) {
+ // Using the sampler to trim the extra (over-sampled) results
+ fres = sampler.trimResult(fres);
+ }
+ }
+
// final labeling if allowed (because labeling is a costly operation)
frh.labelResult(fres);
fixedRes.add(fres); // add to final results
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingParams.java Wed May 29 08:28:15 2013
@@ -28,7 +28,7 @@ public class SamplingParams {
* Default factor by which more results are requested over the sample set.
* @see SamplingParams#getOversampleFactor()
*/
- public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d;
+ public static final double DEFAULT_OVERSAMPLE_FACTOR = 1d;
/**
* Default ratio between size of sample to original size of document set.
@@ -59,6 +59,8 @@ public class SamplingParams {
private double sampleRatio = DEFAULT_SAMPLE_RATIO;
private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR;
+
+ private SampleFixer sampleFixer = null;
/**
* Return the maxSampleSize.
@@ -166,4 +168,29 @@ public class SamplingParams {
this.oversampleFactor = oversampleFactor;
}
-}
\ No newline at end of file
+ /**
+ * @return {@link SampleFixer} to be used while fixing the sampled results, if
+ * <code>null</code> no fixing will be performed
+ */
+ public SampleFixer getSampleFixer() {
+ return sampleFixer;
+ }
+
+ /**
+ * Set a {@link SampleFixer} to be used while fixing the sampled results.
+ * {@code null} means no fixing will be performed
+ */
+ public void setSampleFixer(SampleFixer sampleFixer) {
+ this.sampleFixer = sampleFixer;
+ }
+
+ /**
+ * Returns whether over-sampling should be done. By default returns
+ * {@code true} when {@link #getSampleFixer()} is not {@code null} and
+ * {@link #getOversampleFactor()} > 1, {@code false} otherwise.
+ */
+ public boolean shouldOverSample() {
+ return sampleFixer != null && oversampleFactor > 1d;
+ }
+
+}
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/SamplingWrapper.java Wed May 29 08:28:15 2013
@@ -52,29 +52,41 @@ public class SamplingWrapper extends Sta
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
// Replacing the original searchParams with the over-sampled (and without statistics-compute)
FacetSearchParams original = delegee.searchParams;
- delegee.searchParams = sampler.overSampledSearchParams(original);
+ boolean shouldOversample = sampler.samplingParams.shouldOverSample();
+
+ if (shouldOversample) {
+ delegee.searchParams = sampler.overSampledSearchParams(original);
+ }
SampleResult sampleSet = sampler.getSampleSet(docids);
List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);
List<FacetResult> fixedRes = new ArrayList<FacetResult>();
+ SampleFixer sampleFixer = sampler.samplingParams.getSampleFixer();
+
for (FacetResult fres : sampleRes) {
// for sure fres is not null because this is guaranteed by the delegee.
PartitionsFacetResultsHandler frh = createFacetResultsHandler(fres.getFacetRequest());
- // fix the result of current request
- sampler.getSampleFixer(indexReader, taxonomyReader, searchParams).fixResult(docids, fres);
- fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+ if (sampleFixer != null) {
+ // fix the result of current request
+ sampleFixer.fixResult(docids, fres, sampleSet.actualSampleRatio);
+ fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any
+ }
- // Using the sampler to trim the extra (over-sampled) results
- fres = sampler.trimResult(fres);
+ if (shouldOversample) {
+ // Using the sampler to trim the extra (over-sampled) results
+ fres = sampler.trimResult(fres);
+ }
// final labeling if allowed (because labeling is a costly operation)
frh.labelResult(fres);
fixedRes.add(fres); // add to final results
}
- delegee.searchParams = original; // Back to original params
+ if (shouldOversample) {
+ delegee.searchParams = original; // Back to original params
+ }
return fixedRes;
}
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/sampling/TakmiSampleFixer.java Wed May 29 08:28:15 2013
@@ -2,21 +2,19 @@ package org.apache.lucene.facet.sampling
import java.io.IOException;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
-
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.DrillDownQuery;
-import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetResultNode;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -36,16 +34,21 @@ import org.apache.lucene.facet.taxonomy.
*/
/**
- * Fix sampling results by counting the intersection between two lists: a
- * TermDocs (list of documents in a certain category) and a DocIdSetIterator
- * (list of documents matching the query).
- *
+ * Fix sampling results by correct results, by counting the intersection between
+ * two lists: a TermDocs (list of documents in a certain category) and a
+ * DocIdSetIterator (list of documents matching the query).
+ * <p>
+ * This fixer is suitable for scenarios which prioritize accuracy over
+ * performance.
+ * <p>
+ * <b>Note:</b> for statistically more accurate top-k selection, set
+ * {@link SamplingParams#setOversampleFactor(double) oversampleFactor} to at
+ * least 2, so that the top-k categories would have better chance of showing up
+ * in the sampled top-cK results (see {@link SamplingParams#getOversampleFactor}
*
* @lucene.experimental
*/
-// TODO (Facet): implement also an estimated fixing by ratio (taking into
-// account "translation" of counts!)
-class TakmiSampleFixer implements SampleFixer {
+public class TakmiSampleFixer extends SampleFixer {
private TaxonomyReader taxonomyReader;
private IndexReader indexReader;
@@ -59,28 +62,10 @@ class TakmiSampleFixer implements Sample
}
@Override
- public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
- throws IOException {
- FacetResultNode topRes = fres.getFacetResultNode();
- fixResultNode(topRes, origDocIds);
- }
-
- /**
- * Fix result node count, and, recursively, fix all its children
- *
- * @param facetResNode
- * result node to be fixed
- * @param docIds
- * docids in effect
- * @throws IOException If there is a low-level I/O error.
- */
- private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) throws IOException {
+ public void singleNodeFix(FacetResultNode facetResNode, ScoredDocIDs docIds, double samplingRatio) throws IOException {
recount(facetResNode, docIds);
- for (FacetResultNode frn : facetResNode.subResults) {
- fixResultNode(frn, docIds);
- }
}
-
+
/**
* Internal utility: recount for a facet result node
*
@@ -179,4 +164,5 @@ class TakmiSampleFixer implements Sample
}
return false; // exhausted
}
+
}
\ No newline at end of file
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java Wed May 29 08:28:15 2013
@@ -94,7 +94,7 @@ public class StandardFacetsAccumulator e
private Object accumulateGuard;
- private double complementThreshold;
+ private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {
Modified: lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java?rev=1487399&r1=1487398&r2=1487399&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java (original)
+++ lucene/dev/branches/branch_4x/lucene/facet/src/test/org/apache/lucene/facet/sampling/BaseSampleTestTopK.java Wed May 29 08:28:15 2013
@@ -94,7 +94,7 @@ public abstract class BaseSampleTestTopK
for (int nTrial = 0; nTrial < RETRIES; nTrial++) {
try {
// complement with sampling!
- final Sampler sampler = createSampler(nTrial, useRandomSampler);
+ final Sampler sampler = createSampler(nTrial, useRandomSampler, samplingSearchParams);
assertSampling(expectedResults, q, sampler, samplingSearchParams, false);
assertSampling(expectedResults, q, sampler, samplingSearchParams, true);
@@ -128,14 +128,20 @@ public abstract class BaseSampleTestTopK
return FacetsCollector.create(sfa);
}
- private Sampler createSampler(int nTrial, boolean useRandomSampler) {
+ private Sampler createSampler(int nTrial, boolean useRandomSampler, FacetSearchParams sParams) {
SamplingParams samplingParams = new SamplingParams();
+ /*
+ * Set sampling to Exact fixing with TakmiSampleFixer as it is not easy to
+ * validate results with amortized results.
+ */
+ samplingParams.setSampleFixer(new TakmiSampleFixer(indexReader, taxoReader, sParams));
+
final double retryFactor = Math.pow(1.01, nTrial);
+ samplingParams.setOversampleFactor(5.0 * retryFactor); // Oversampling
samplingParams.setSampleRatio(0.8 * retryFactor);
samplingParams.setMinSampleSize((int) (100 * retryFactor));
samplingParams.setMaxSampleSize((int) (10000 * retryFactor));
- samplingParams.setOversampleFactor(5.0 * retryFactor);
samplingParams.setSamplingThreshold(11000); //force sampling
Sampler sampler = useRandomSampler ?