You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2015/05/01 04:15:02 UTC

svn commit: r1677091 - in /lucene/dev/trunk/solr: CHANGES.txt core/src/java/org/apache/solr/search/facet/SlotAcc.java core/src/java/org/apache/solr/search/facet/UniqueAgg.java core/src/test/org/apache/solr/search/facet/TestJsonFacets.java

Author: yonik
Date: Fri May  1 02:15:02 2015
New Revision: 1677091

URL: http://svn.apache.org/r1677091
Log:
SOLR-7494: fix unique() function for high cardinality fields

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1677091&r1=1677090&r2=1677091&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Fri May  1 02:15:02 2015
@@ -233,6 +233,10 @@ Bug Fixes
 * SOLR-7478: UpdateLog#close shuts down it's executor with interrupts before running it's close logic,
   possibly preventing a clean close. (Mark Miller)
 
+* SOLR-7494: Facet Module - unique() facet function was wildly inaccurate for high cardinality
+  fields. (Andy Crossen, yonik)
+
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java?rev=1677091&r1=1677090&r2=1677091&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java Fri May  1 02:15:02 2015
@@ -386,17 +386,19 @@ abstract class UniqueSlotAcc extends Slo
     int maxExplicit=100;
     // TODO: make configurable
     // TODO: share values across buckets
-    if (unique <= maxExplicit) {
+    if (unique > 0) {
+
       List lst = new ArrayList( Math.min(unique, maxExplicit) );
 
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
+      long maxOrd = ords.length();
+      if (ords != null && ords.length() > 0) {
+        for (int ord=0; lst.size() < maxExplicit;) {
           ord = ords.nextSetBit(ord);
           if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
           BytesRef val = lookupOrd(ord);
           Object o = field.getType().toObject(field, val);
           lst.add(o);
+          if (++ord >= maxOrd) break;
         }
       }
 
@@ -556,43 +558,6 @@ class UniqueMultivaluedSlotAcc extends U
   }
 
   @Override
-  public Object getShardValue(int slot) throws IOException {
-    FixedBitSet ords = arr[slot];
-    int unique;
-    if (counts != null) {
-      unique = counts[slot];
-    } else {
-      unique = ords == null ? 0 : ords.cardinality();
-    }
-
-    SimpleOrderedMap map = new SimpleOrderedMap();
-    map.add("unique", unique);
-    map.add("nTerms", nTerms);
-
-    int maxExplicit=100;
-    // TODO: make configurable
-    // TODO: share values across buckets
-    if (unique <= maxExplicit) {
-      List lst = new ArrayList( Math.min(unique, maxExplicit) );
-
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
-          ord = ords.nextSetBit(ord);
-          if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
-          BytesRef val = docToTerm.lookupOrd(ord);
-          Object o = field.getType().toObject(field, val);
-          lst.add(o);
-        }
-      }
-
-      map.add("vals", lst);
-    }
-
-    return map;
-  }
-
-  @Override
   protected BytesRef lookupOrd(int ord) throws IOException {
     return docToTerm.lookupOrd(ord);
   }

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java?rev=1677091&r1=1677090&r2=1677091&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java Fri May  1 02:15:02 2015
@@ -67,9 +67,10 @@ public class UniqueAgg extends StrAggVal
   }
 
   private static class Merger extends FacetSortableMerger {
+    long answer = -1;
     long sumUnique;
     Set<Object> values;
-    int shardsMissing;
+    long sumAdded;
     long shardsMissingSum;
     long shardsMissingMax;
 
@@ -79,24 +80,35 @@ public class UniqueAgg extends StrAggVal
       long unique = ((Number)map.get("unique")).longValue();
       sumUnique += unique;
 
-      List vals = (List)map.get("vals");
+      int valsListed = 0;
+      List vals = (List) map.get("vals");
       if (vals != null) {
         if (values == null) {
           values = new HashSet<>(vals.size()*4);
         }
         values.addAll(vals);
-      } else {
-        shardsMissing++;
-        shardsMissingSum += unique;
-        shardsMissingMax = Math.max(shardsMissingMax, unique);
+        valsListed = vals.size();
+        sumAdded += valsListed;
       }
 
+      shardsMissingSum += unique - valsListed;
+      shardsMissingMax = Math.max(shardsMissingMax, unique - valsListed);
       // TODO: somehow get & use the count in the bucket?
     }
 
     private long getLong() {
-      long exactCount = values == null ? 0 : values.size();
-      return exactCount + shardsMissingSum;
+      if (answer >= 0) return answer;
+      answer = values == null ? 0 : values.size();
+      if (answer == 0) {
+        // either a real "0", or no values returned from shards
+        answer = shardsMissingSum;
+        return answer;
+      }
+
+      double factor = ((double)values.size()) / sumAdded;  // what fraction of listed values were unique
+      long estimate = (long)(shardsMissingSum * factor);
+      answer = values.size() + estimate;
+      return answer;
     }
 
     @Override

Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java?rev=1677091&r1=1677090&r2=1677091&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java Fri May  1 02:15:02 2015
@@ -886,16 +886,76 @@ public class TestJsonFacets extends Solr
 
   }
 
-
   @Test
   public void testDistrib() throws Exception {
     initServers();
-    Client client = servers.getClient( random().nextInt() );
+    Client client = servers.getClient(random().nextInt());
     client.queryDefaults().set( "shards", servers.getShards() );
     doStats( client, params() );
   }
 
 
+  @Test
+  public void testBigger() throws Exception {
+    ModifiableSolrParams p = params("rows", "0", "cat_s", "cat_ss", "where_s", "where_ss");
+    //    doBigger(Client.localClient, p);
+
+    initServers();
+    Client client = servers.getClient(random().nextInt());
+    client.queryDefaults().set( "shards", servers.getShards() );
+    doBigger( client, p );
+  }
+
+  public void doBigger(Client client, ModifiableSolrParams p) throws Exception {
+    MacroExpander m = new MacroExpander(p.getMap());
+
+    String cat_s = m.expand("${cat_s}");
+    String where_s = m.expand("${where_s}");
+
+    client.deleteByQuery("*:*", null);
+
+    Random r = new Random(0);  // make deterministic
+    int numCat = 1;
+    int numWhere = 2000000000;
+    int commitPercent = 10;
+    int ndocs=1000;
+
+    Map<Integer, Map<Integer, List<Integer>>> model = new HashMap();  // cat->where->list<ids>
+    for (int i=0; i<ndocs; i++) {
+      Integer cat = r.nextInt(numCat);
+      Integer where = r.nextInt(numWhere);
+      client.add( sdoc("id", i, cat_s,cat, where_s, where) , null );
+      Map<Integer,List<Integer>> sub = model.get(cat);
+      if (sub == null) {
+        sub = new HashMap<>();
+        model.put(cat, sub);
+      }
+      List<Integer> ids = sub.get(where);
+      if (ids == null) {
+        ids = new ArrayList<>();
+        sub.put(where, ids);
+      }
+      ids.add(i);
+
+      if (r.nextInt(100) < commitPercent) {
+        client.commit();
+      }
+    }
+
+    client.commit();
+
+    int sz = model.get(0).size();
+
+    client.testJQ(params(p, "q", "*:*"
+            , "json.facet", "{f1:{type:terms, field:${cat_s}, limit:2, facet:{x:'unique($where_s)'}  }}"
+        )
+        , "facets=={ 'count':" + ndocs + "," +
+            "'f1':{  'buckets':[{ 'val':'0', 'count':" + ndocs + ", x:" + sz + " }]} } "
+    );
+  }
+
+
+
   public void XtestPercentiles() {
     AVLTreeDigest catA = new AVLTreeDigest(100);
     catA.add(4);