You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2015/05/01 04:15:36 UTC
svn commit: r1677092 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/core/ solr/core/src/java/org/apache/solr/search/facet/
solr/core/src/test/org/apache/solr/search/facet/
Author: yonik
Date: Fri May 1 02:15:36 2015
New Revision: 1677092
URL: http://svn.apache.org/r1677092
Log:
SOLR-7494: fix unique() function for high cardinality fields
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java
lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1677092&r1=1677091&r2=1677092&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Fri May 1 02:15:36 2015
@@ -176,6 +176,10 @@ Bug Fixes
* SOLR-7478: UpdateLog#close shuts down it's executor with interrupts before running it's close logic,
possibly preventing a clean close. (Mark Miller)
+* SOLR-7494: Facet Module - unique() facet function was wildly inaccurate for high cardinality
+ fields. (Andy Crossen, yonik)
+
+
Optimizations
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java?rev=1677092&r1=1677091&r2=1677092&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java Fri May 1 02:15:36 2015
@@ -386,17 +386,19 @@ abstract class UniqueSlotAcc extends Slo
int maxExplicit=100;
// TODO: make configurable
// TODO: share values across buckets
- if (unique <= maxExplicit) {
+ if (unique > 0) {
+
List lst = new ArrayList( Math.min(unique, maxExplicit) );
- if (ords != null) {
- for (int ord=-1;;) {
- if (++ord >= unique) break;
+ long maxOrd = ords.length();
+ if (ords != null && ords.length() > 0) {
+ for (int ord=0; lst.size() < maxExplicit;) {
ord = ords.nextSetBit(ord);
if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
BytesRef val = lookupOrd(ord);
Object o = field.getType().toObject(field, val);
lst.add(o);
+ if (++ord >= maxOrd) break;
}
}
@@ -556,43 +558,6 @@ class UniqueMultivaluedSlotAcc extends U
}
@Override
- public Object getShardValue(int slot) throws IOException {
- FixedBitSet ords = arr[slot];
- int unique;
- if (counts != null) {
- unique = counts[slot];
- } else {
- unique = ords == null ? 0 : ords.cardinality();
- }
-
- SimpleOrderedMap map = new SimpleOrderedMap();
- map.add("unique", unique);
- map.add("nTerms", nTerms);
-
- int maxExplicit=100;
- // TODO: make configurable
- // TODO: share values across buckets
- if (unique <= maxExplicit) {
- List lst = new ArrayList( Math.min(unique, maxExplicit) );
-
- if (ords != null) {
- for (int ord=-1;;) {
- if (++ord >= unique) break;
- ord = ords.nextSetBit(ord);
- if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
- BytesRef val = docToTerm.lookupOrd(ord);
- Object o = field.getType().toObject(field, val);
- lst.add(o);
- }
- }
-
- map.add("vals", lst);
- }
-
- return map;
- }
-
- @Override
protected BytesRef lookupOrd(int ord) throws IOException {
return docToTerm.lookupOrd(ord);
}
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java?rev=1677092&r1=1677091&r2=1677092&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java Fri May 1 02:15:36 2015
@@ -67,9 +67,10 @@ public class UniqueAgg extends StrAggVal
}
private static class Merger extends FacetSortableMerger {
+ long answer = -1;
long sumUnique;
Set<Object> values;
- int shardsMissing;
+ long sumAdded;
long shardsMissingSum;
long shardsMissingMax;
@@ -79,24 +80,35 @@ public class UniqueAgg extends StrAggVal
long unique = ((Number)map.get("unique")).longValue();
sumUnique += unique;
- List vals = (List)map.get("vals");
+ int valsListed = 0;
+ List vals = (List) map.get("vals");
if (vals != null) {
if (values == null) {
values = new HashSet<>(vals.size()*4);
}
values.addAll(vals);
- } else {
- shardsMissing++;
- shardsMissingSum += unique;
- shardsMissingMax = Math.max(shardsMissingMax, unique);
+ valsListed = vals.size();
+ sumAdded += valsListed;
}
+ shardsMissingSum += unique - valsListed;
+ shardsMissingMax = Math.max(shardsMissingMax, unique - valsListed);
// TODO: somehow get & use the count in the bucket?
}
private long getLong() {
- long exactCount = values == null ? 0 : values.size();
- return exactCount + shardsMissingSum;
+ if (answer >= 0) return answer;
+ answer = values == null ? 0 : values.size();
+ if (answer == 0) {
+ // either a real "0", or no values returned from shards
+ answer = shardsMissingSum;
+ return answer;
+ }
+
+ double factor = ((double)values.size()) / sumAdded; // what fraction of listed values were unique
+ long estimate = (long)(shardsMissingSum * factor);
+ answer = values.size() + estimate;
+ return answer;
}
@Override
Modified: lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java?rev=1677092&r1=1677091&r2=1677092&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java Fri May 1 02:15:36 2015
@@ -886,16 +886,76 @@ public class TestJsonFacets extends Solr
}
-
@Test
public void testDistrib() throws Exception {
initServers();
- Client client = servers.getClient( random().nextInt() );
+ Client client = servers.getClient(random().nextInt());
client.queryDefaults().set( "shards", servers.getShards() );
doStats( client, params() );
}
+ @Test
+ public void testBigger() throws Exception {
+ ModifiableSolrParams p = params("rows", "0", "cat_s", "cat_ss", "where_s", "where_ss");
+ // doBigger(Client.localClient, p);
+
+ initServers();
+ Client client = servers.getClient(random().nextInt());
+ client.queryDefaults().set( "shards", servers.getShards() );
+ doBigger( client, p );
+ }
+
+ public void doBigger(Client client, ModifiableSolrParams p) throws Exception {
+ MacroExpander m = new MacroExpander(p.getMap());
+
+ String cat_s = m.expand("${cat_s}");
+ String where_s = m.expand("${where_s}");
+
+ client.deleteByQuery("*:*", null);
+
+ Random r = new Random(0); // make deterministic
+ int numCat = 1;
+ int numWhere = 2000000000;
+ int commitPercent = 10;
+ int ndocs=1000;
+
+ Map<Integer, Map<Integer, List<Integer>>> model = new HashMap(); // cat->where->list<ids>
+ for (int i=0; i<ndocs; i++) {
+ Integer cat = r.nextInt(numCat);
+ Integer where = r.nextInt(numWhere);
+ client.add( sdoc("id", i, cat_s,cat, where_s, where) , null );
+ Map<Integer,List<Integer>> sub = model.get(cat);
+ if (sub == null) {
+ sub = new HashMap<>();
+ model.put(cat, sub);
+ }
+ List<Integer> ids = sub.get(where);
+ if (ids == null) {
+ ids = new ArrayList<>();
+ sub.put(where, ids);
+ }
+ ids.add(i);
+
+ if (r.nextInt(100) < commitPercent) {
+ client.commit();
+ }
+ }
+
+ client.commit();
+
+ int sz = model.get(0).size();
+
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{type:terms, field:${cat_s}, limit:2, facet:{x:'unique($where_s)'} }}"
+ )
+ , "facets=={ 'count':" + ndocs + "," +
+ "'f1':{ 'buckets':[{ 'val':'0', 'count':" + ndocs + ", x:" + sz + " }]} } "
+ );
+ }
+
+
+
public void XtestPercentiles() {
AVLTreeDigest catA = new AVLTreeDigest(100);
catA.add(4);