You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2011/04/18 00:24:45 UTC
svn commit: r1094214 - in /lucene/dev/trunk/solr: CHANGES.txt
src/java/org/apache/solr/handler/component/FacetComponent.java
src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java
src/test/org/apache/solr/TestDistributedSearch.java
Author: yonik
Date: Sun Apr 17 22:24:45 2011
New Revision: 1094214
URL: http://svn.apache.org/viewvc?rev=1094214&view=rev
Log:
SOLR-2403: fix facet.mincount>0 && facet.sort=index errors, add some algorithmic improvements, introduce more random testing for distrib faceted testing
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java
lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java
lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sun Apr 17 22:24:45 2011
@@ -270,7 +270,12 @@ Bug Fixes
* SOLR-2409: edismax parser - treat the text of a fielded query as a literal if the
fieldname does not exist. For example Mission: Impossible should not search on
the "Mission" field unless it's a valid field in the schema. (Ryan McKinley, yonik)
-
+
+* SOLR-2403: facet.sort=index reported incorrect results for distributed search
+ in a number of scenarios when facet.mincount>0. This patch also adds some
+ performance/algorithmic improvements when (facet.sort=count && facet.mincount=1
+ && facet.limit=-1) and when (facet.sort=index && facet.mincount>0) (yonik)
+
Other Changes
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/handler/component/FacetComponent.java Sun Apr 17 22:24:45 2011
@@ -222,11 +222,37 @@ public class FacetComponent extends Sear
sreq.params.remove(paramStart + FacetParams.FACET_MINCOUNT);
sreq.params.remove(paramStart + FacetParams.FACET_OFFSET);
- dff.initialLimit = dff.offset + dff.limit;
+ dff.initialLimit = dff.limit <= 0 ? dff.limit : dff.offset + dff.limit;
- if(dff.sort.equals(FacetParams.FACET_SORT_COUNT) && dff.limit > 0) {
- // set the initial limit higher to increase accuracy
- dff.initialLimit = (int)(dff.initialLimit * 1.5) + 10;
+ if (dff.sort.equals(FacetParams.FACET_SORT_COUNT)) {
+ if (dff.limit > 0) {
+ // set the initial limit higher to increase accuracy
+ dff.initialLimit = (int)(dff.initialLimit * 1.5) + 10;
+ dff.initialMincount = 0; // TODO: we could change this to 1, but would then need more refinement for small facet result sets?
+ } else {
+ // if limit==-1, then no need to artificially lower mincount to 0 if it's 1
+ dff.initialMincount = Math.min(dff.minCount, 1);
+ }
+ } else {
+ // we're sorting by index order.
+ // if minCount==0, we should always be able to get accurate results w/o over-requesting or refining
+ // if minCount==1, we should be able to get accurate results w/o over-requesting, but we'll need to refine
+ // if minCount==n (>1), we can set the initialMincount to minCount/nShards, rounded up.
+ // For example, we know that if minCount=10 and we have 3 shards, then at least one shard must have a count of 4 for the term
+ // For the minCount>1 case, we can generate too short of a list (miss terms at the end of the list) unless limit==-1
+ // For example: each shard could produce a list of top 10, but some of those could fail to make it into the combined list (i.e.
+ // we needed to go beyond the top 10 to generate the top 10 combined). Overrequesting can help a little here, but not as
+ // much as when sorting by count.
+ if (dff.minCount <= 1) {
+ dff.initialMincount = dff.minCount;
+ } else {
+ dff.initialMincount = (int)Math.ceil((double)dff.minCount / rb.slices.length);
+ // dff.initialMincount = 1;
+ }
+ }
+
+ if (dff.initialMincount != 0) {
+ sreq.params.set(paramStart + FacetParams.FACET_MINCOUNT, dff.initialMincount);
}
// Currently this is for testing only and allows overriding of the
@@ -296,15 +322,18 @@ public class FacetComponent extends Sear
//
for (DistribFieldFacet dff : fi.facets.values()) {
- if (dff.limit <= 0) continue; // no need to check these facets for refinement
- if (dff.minCount <= 1 && dff.sort.equals(FacetParams.FACET_SORT_INDEX)) continue;
+ // no need to check these facets for refinement
+ if (dff.initialLimit <= 0 && dff.initialMincount == 0) continue;
+
+ // only other case where index-sort doesn't need refinement is if minCount==0
+ if (dff.minCount == 0 && dff.sort.equals(FacetParams.FACET_SORT_INDEX)) continue;
- @SuppressWarnings("unchecked") // generic array's are anoying
+ @SuppressWarnings("unchecked") // generic array's are annoying
List<String>[] tmp = (List<String>[]) new List[rb.shards.length];
dff._toRefine = tmp;
ShardFacetCount[] counts = dff.getCountSorted();
- int ntop = Math.min(counts.length, dff.offset + dff.limit);
+ int ntop = Math.min(counts.length, dff.limit >= 0 ? dff.offset + dff.limit : Integer.MAX_VALUE);
long smallestCount = counts.length == 0 ? 0 : counts[ntop-1].count;
for (int i=0; i<counts.length; i++) {
@@ -313,8 +342,11 @@ public class FacetComponent extends Sear
if (i<ntop) {
// automatically flag the top values for refinement
+ // this should always be true for facet.sort=index
needRefinement = true;
} else {
+ // this logic should only be invoked for facet.sort=index (for now)
+
// calculate the maximum value that this term may have
// and if it is >= smallestCount, then flag for refinement
long maxCount = sfc.count;
@@ -422,13 +454,32 @@ public class FacetComponent extends Sear
counts = dff.getLexSorted();
}
- int end = dff.limit < 0 ? counts.length : Math.min(dff.offset + dff.limit, counts.length);
- for (int i=dff.offset; i<end; i++) {
- if (counts[i].count < dff.minCount) {
- if (countSorted) break; // if sorted by count, we can break out of loop early
- else continue;
+ if (countSorted) {
+ int end = dff.limit < 0 ? counts.length : Math.min(dff.offset + dff.limit, counts.length);
+ for (int i=dff.offset; i<end; i++) {
+ if (counts[i].count < dff.minCount) {
+ break;
+ }
+ fieldCounts.add(counts[i].name, num(counts[i].count));
+ }
+ } else {
+ int off = dff.offset;
+ int lim = dff.limit >= 0 ? dff.limit : Integer.MAX_VALUE;
+
+ // index order...
+ for (int i=0; i<counts.length; i++) {
+ long count = counts[i].count;
+ if (count < dff.minCount) continue;
+ if (off > 0) {
+ off--;
+ continue;
+ }
+ if (lim <= 0) {
+ break;
+ }
+ lim--;
+ fieldCounts.add(counts[i].name, num(count));
}
- fieldCounts.add(counts[i].name, num(counts[i].count));
}
if (dff.missing) {
@@ -631,7 +682,8 @@ public class FacetComponent extends Sear
public HashMap<String,ShardFacetCount> counts = new HashMap<String,ShardFacetCount>(128);
public int termNum;
- public int initialLimit; // how many terms requested in first phase
+ public int initialLimit; // how many terms requested in first phase
+ public int initialMincount; // mincount param sent to each shard
public boolean needRefinements;
public ShardFacetCount[] countSorted;
@@ -671,11 +723,10 @@ public class FacetComponent extends Sear
}
}
- // the largest possible missing term is 0 if we received less
- // than the number requested (provided mincount==0 like it should be for
- // a shard request)
+ // the largest possible missing term is initialMincount if we received less
+ // than the number requested.
if (numRequested<0 || numRequested != 0 && numReceived < numRequested) {
- last = 0;
+ last = initialMincount;
}
missingMaxPossible += last;
Modified: lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java (original)
+++ lucene/dev/trunk/solr/src/test-framework/org/apache/solr/BaseDistributedSearchTestCase.java Sun Apr 17 22:24:45 2011
@@ -323,12 +323,15 @@ public abstract class BaseDistributedSea
protected void query(Object... q) throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
+ params.add("reqid",Integer.toString(random.nextInt())); // just to help correlate top-level requests w/ sub requests
for (int i = 0; i < q.length; i += 2) {
params.add(q[i].toString(), q[i + 1].toString());
}
+ params.add("controlClient","true"); // just to enable easier sorting through log files
final QueryResponse controlRsp = controlClient.query(params);
+ params.remove("controlClient");
setDistributedParams(params);
@@ -418,7 +421,7 @@ public abstract class BaseDistributedSea
break;
}
if (ordered) {
- return "." + namea + "!=" + nameb + " (unordered or missing)";
+ return err("." + namea + "!=" + nameb + " (unordered or missing)");
}
// if unordered, continue until we find the right field.
}
@@ -432,7 +435,7 @@ public abstract class BaseDistributedSea
if (a.size() - aSkipped != b.size() - bSkipped) {
- return ".size()==" + a.size() + "," + b.size() + "skipped=" + aSkipped + "," + bSkipped;
+ return err(".size()==" + a.size() + "," + b.size() + "skipped=" + aSkipped + "," + bSkipped);
}
return null;
@@ -446,7 +449,7 @@ public abstract class BaseDistributedSea
int flagsa = flags(handle, keya);
if ((flagsa & SKIP) != 0) continue;
if (!b.containsKey(keya)) {
- return "[" + keya + "]==null";
+ return err("[" + keya + "]==null");
}
if ((flagsa & SKIPVAL) != 0) continue;
Object valb = b.get(keya);
@@ -478,7 +481,7 @@ public abstract class BaseDistributedSea
} else {
if (b.getMaxScore() != null) {
if (a.getMaxScore() == null) {
- return ".maxScore missing";
+ return err(".maxScore missing");
}
}
}
@@ -524,7 +527,7 @@ public abstract class BaseDistributedSea
public static String compare(Object[] a, Object[] b, int flags, Map<String, Integer> handle) {
if (a.length != b.length) {
- return ".length:" + a.length + "!=" + b.length;
+ return err(".length:" + a.length + "!=" + b.length);
}
for (int i = 0; i < a.length; i++) {
String cmp = compare(a[i], b[i], flags, handle);
@@ -535,7 +538,7 @@ public abstract class BaseDistributedSea
public static String compare(Object a, Object b, int flags, Map<String, Integer> handle) {
if (a == b) return null;
- if (a == null || b == null) return ":" + a + "!=" + b;
+ if (a == null || b == null) return err(":" + a + "!=" + b);
if (a instanceof NamedList && b instanceof NamedList) {
return compare((NamedList) a, (NamedList) b, flags, handle);
@@ -559,7 +562,7 @@ public abstract class BaseDistributedSea
if (a instanceof byte[] && b instanceof byte[]) {
if (!Arrays.equals((byte[]) a, (byte[]) b)) {
- return ":" + a + "!=" + b;
+ return err(":" + a + "!=" + b);
}
return null;
}
@@ -570,12 +573,19 @@ public abstract class BaseDistributedSea
}
if (!(a.equals(b))) {
- return ":" + a + "!=" + b;
+ return err(":" + a + "!=" + b);
}
return null;
}
+ /** This method is called for root level comparison errors and can be helpful to set a
+ * breakpoint in to debug comparison failures.
+ */
+ public static String err(String msg) {
+ return msg;
+ }
+
protected void compareResponses(QueryResponse a, QueryResponse b) {
String cmp;
cmp = compare(a.getResponse(), b.getResponse(), flags, handle);
Modified: lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java?rev=1094214&r1=1094213&r2=1094214&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java (original)
+++ lucene/dev/trunk/solr/src/test/org/apache/solr/TestDistributedSearch.java Sun Apr 17 22:24:45 2011
@@ -48,6 +48,9 @@ public class TestDistributedSearch exten
@Override
public void doTest() throws Exception {
+ int backupStress = stress; // make a copy so we can restore
+
+
del("*:*");
indexr(id,1, i1, 100, tlong, 100,t1,"now is the time for all good men"
,"foo_f", 1.414f, "foo_b", "true", "foo_d", 1.414d);
@@ -132,23 +135,36 @@ public class TestDistributedSearch exten
// then the primary sort should always be a tie and then the secondary should always decide
query("q","{!func}ms(NOW)", "sort","score desc,"+i1+" desc","fl","id");
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1);
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","count");
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","count", "facet.mincount",2);
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","index");
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.limit",-1, "facet.sort","index", "facet.mincount",2);
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.offset",10, "facet.limit",1, "facet.sort","index");
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1,"facet.limit",1);
- query("q","*:*", "rows",100, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*");
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.offset",1);
- query("q","*:*", "rows",100, "facet","true", "facet.field",t1, "facet.mincount",2);
+ query("q","*:*", "rows",0, "facet","true", "facet.field",t1);
+ query("q","*:*", "rows",0, "facet","true", "facet.field",t1,"facet.limit",1);
+ query("q","*:*", "rows",0, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*");
+ query("q","*:*", "rows",0, "facet","true", "facet.field",t1, "facet.mincount",2);
+
+ stress=0; // turn off stress... we want to tex max combos in min time
+ for (int i=0; i<25*RANDOM_MULTIPLIER; i++) {
+ String f = fieldNames[random.nextInt(fieldNames.length)];
+ if (random.nextBoolean()) f = t1; // the text field is a really interesting one to facet on (and it's multi-valued too)
+
+ // we want a random query and not just *:* so we'll get zero counts in facets also
+ // TODO: do a better random query
+ String q = random.nextBoolean() ? "*:*" : "id:(1 3 5 7 9 11 13) OR id:[100 TO " + random.nextInt(50) + "]";
+
+ int nolimit = random.nextBoolean() ? -1 : 10000; // these should be equivalent
+
+ // if limit==-1, we should always get exact matches
+ query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.limit",nolimit, "facet.sort","count", "facet.mincount",random.nextInt(5), "facet.offset",random.nextInt(10));
+ query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.limit",nolimit, "facet.sort","index", "facet.mincount",random.nextInt(5), "facet.offset",random.nextInt(10));
+ // for index sort, we should get exact results for mincount <= 1
+ query("q",q, "rows",0, "facet","true", "facet.field",f, "facet.sort","index", "facet.mincount",random.nextInt(2), "facet.offset",random.nextInt(10), "facet.limit",random.nextInt(11)-1);
+ }
+ stress = backupStress; // restore stress
// test faceting multiple things at once
- query("q","*:*", "rows",100, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*"
+ query("q","*:*", "rows",0, "facet","true", "facet.query","quick", "facet.query","all", "facet.query","*:*"
,"facet.field",t1);
// test filter tagging, facet exclusion, and naming (multi-select facet support)
- query("q","*:*", "rows",100, "facet","true", "facet.query","{!key=myquick}quick", "facet.query","{!key=myall ex=a}all", "facet.query","*:*"
+ query("q","*:*", "rows",0, "facet","true", "facet.query","{!key=myquick}quick", "facet.query","{!key=myall ex=a}all", "facet.query","*:*"
,"facet.field","{!key=mykey ex=a}"+t1
,"facet.field","{!key=other ex=b}"+t1
,"facet.field","{!key=again ex=a,b}"+t1