You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2017/05/24 00:52:50 UTC
lucene-solr:master: SOLR-10634: calc metrics in first phase if
limit=-1 and no subfacets
Repository: lucene-solr
Updated Branches:
refs/heads/master ea9adc055 -> d60c72f34
SOLR-10634: calc metrics in first phase if limit=-1 and no subfacets
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d60c72f3
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d60c72f3
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d60c72f3
Branch: refs/heads/master
Commit: d60c72f34ca9c63ac6075e00dac844c6f052d0a8
Parents: ea9adc0
Author: yonik <yo...@apache.org>
Authored: Tue May 23 20:52:46 2017 -0400
Committer: yonik <yo...@apache.org>
Committed: Tue May 23 20:52:46 2017 -0400
----------------------------------------------------------------------
solr/CHANGES.txt | 8 +
.../solr/search/facet/FacetFieldProcessor.java | 81 +++++++++-
.../solr/collection1/conf/solrconfig-tlog.xml | 8 +
.../org/apache/solr/search/facet/DebugAgg.java | 146 +++++++++++++++++++
.../solr/search/facet/TestJsonFacets.java | 134 +++++++++++++++--
5 files changed, 366 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d60c72f3/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 928a190..5b92e3c 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -231,6 +231,14 @@ Bug Fixes
resulting in exceptions when using a hashing faceting method and sorting by hll(numeric_field).
(yonik)
+Optimizations
+----------------------
+* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
+ and there are no nested facets, aggregations are computed in the first collection phase
+ so that the second phase which would normally involve calculating the domain for the bucket
+ can be skipped entirely, leading to large performance improvements. (yonik)
+
+
Other Changes
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d60c72f3/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
index 143d1fc..7b08e14 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
@@ -129,7 +129,36 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
sortAcc = indexOrderAcc;
deferredAggs = freq.getFacetStats();
- } else {
+ }
+
+ // If we are going to return all buckets and if there are no subfacets (that would need a domain), then don't defer
+ // any aggregation calculations to a second phase. This way we can avoid calculating domains for each bucket, which
+ // can be expensive.
+ if (freq.limit == -1 && freq.subFacets.size() == 0) {
+ accs = new SlotAcc[ freq.getFacetStats().size() ];
+ int otherAccIdx = 0;
+ for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {
+ AggValueSource agg = entry.getValue();
+ SlotAcc acc = agg.createSlotAcc(fcontext, numDocs, numSlots);
+ acc.key = entry.getKey();
+ accMap.put(acc.key, acc);
+ accs[otherAccIdx++] = acc;
+ }
+ if (accs.length == 1) {
+ collectAcc = accs[0];
+ } else {
+ collectAcc = new MultiAcc(fcontext, accs);
+ }
+
+ if (sortAcc == null) {
+ sortAcc = accMap.get(freq.sortVariable);
+ assert sortAcc != null;
+ }
+
+ deferredAggs = null;
+ }
+
+ if (sortAcc == null) {
AggValueSource sortAgg = freq.getFacetStats().get(freq.sortVariable);
if (sortAgg != null) {
collectAcc = sortAgg.createSlotAcc(fcontext, numDocs, numSlots);
@@ -140,7 +169,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
deferredAggs.remove(freq.sortVariable);
}
- if (deferredAggs.size() == 0) {
+ if (deferredAggs == null || deferredAggs.size() == 0) {
deferredAggs = null;
}
@@ -438,6 +467,54 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
+ static class MultiAcc extends SlotAcc {
+ final SlotAcc[] subAccs;
+
+ MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
+ super(fcontext);
+ this.subAccs = subAccs;
+ }
+
+ @Override
+ public void collect(int doc, int slot) throws IOException {
+ for (SlotAcc acc : subAccs) {
+ acc.collect(doc, slot);
+ }
+ }
+
+ @Override
+ public int compare(int slotA, int slotB) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public Object getValue(int slotNum) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ for (SlotAcc acc : subAccs) {
+ acc.reset();
+ }
+ }
+
+ @Override
+ public void resize(Resizer resizer) {
+ for (SlotAcc acc : subAccs) {
+ acc.resize(resizer);
+ }
+ }
+
+ @Override
+ public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+ for (SlotAcc acc : subAccs) {
+ acc.setValues(bucket, slotNum);
+ }
+ }
+ }
+
+
static class SpecialSlotAcc extends SlotAcc {
SlotAcc collectAcc;
SlotAcc[] otherAccs;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d60c72f3/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
index b08af83..cf12c4a 100644
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
@@ -164,4 +164,12 @@
</lst>
</initParams>
+
+ <valueSourceParser name="nvl" class="org.apache.solr.search.function.NvlValueSourceParser">
+ <float name="nvlFloatValue">0.0</float>
+ </valueSourceParser>
+ <valueSourceParser name="agg_debug" class="org.apache.solr.search.facet.DebugAgg$Parser">
+ </valueSourceParser>
+
+
</config>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d60c72f3/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
new file mode 100644
index 0000000..ad4fabf
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.FunctionQParser;
+import org.apache.solr.search.SyntaxError;
+import org.apache.solr.search.ValueSourceParser;
+
+
+public class DebugAgg extends AggValueSource {
+
+ public static class Parser extends ValueSourceParser {
+ @Override
+ public ValueSource parse(FunctionQParser fp) throws SyntaxError {
+ return new DebugAgg();
+ }
+
+ @Override
+ public void init(NamedList args) {
+ }
+ }
+
+
+ public DebugAgg() {
+ super("debug");
+ }
+
+ public DebugAgg(String name) {
+ super(name);
+ }
+
+ @Override
+ public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) {
+ return new Acc(fcontext, numDocs, numSlots);
+ }
+
+ @Override
+ public int hashCode() {
+ return 0;
+ }
+
+ @Override
+ public String description() {
+ return "debug()";
+ }
+
+ public static class Acc extends SlotAcc {
+ public static AtomicLong creates = new AtomicLong(0);
+ public static AtomicLong resets = new AtomicLong(0);
+ public static AtomicLong resizes = new AtomicLong(0);
+ public static Acc last;
+
+ public CountSlotAcc sub;
+ public int numDocs;
+ public int numSlots;
+
+ public Acc(FacetContext fcontext, int numDocs, int numSlots) {
+ super(fcontext);
+ this.last = this;
+ this.numDocs = numDocs;
+ this.numSlots = numSlots;
+ creates.addAndGet(1);
+ sub = new CountSlotArrAcc(fcontext, numSlots);
+
+ new RuntimeException("DEBUG Acc numSlots=" + numSlots).printStackTrace();
+ }
+
+ @Override
+ public void collect(int doc, int slot) throws IOException {
+ sub.collect(doc, slot);
+ }
+
+ @Override
+ public int compare(int slotA, int slotB) {
+ return sub.compare(slotA, slotB);
+ }
+
+ @Override
+ public Object getValue(int slotNum) throws IOException {
+ return sub.getValue(slotNum);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ resets.addAndGet(1);
+ sub.reset();
+ }
+
+ @Override
+ public void resize(Resizer resizer) {
+ resizes.addAndGet(1);
+ this.numSlots = resizer.getNewSize();
+ sub.resize(resizer);
+ }
+
+ @Override
+ public void setNextReader(LeafReaderContext readerContext) throws IOException {
+ sub.setNextReader(readerContext);
+ }
+
+ @Override
+ public int collect(DocSet docs, int slot) throws IOException {
+ return sub.collect(docs, slot);
+ }
+
+ @Override
+ public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+ sub.key = this.key; // TODO: Blech... this should be fixed
+ sub.setValues(bucket, slotNum);
+ }
+
+ @Override
+ public void close() throws IOException {
+ sub.close();
+ }
+ }
+
+ @Override
+ public FacetMerger createFacetMerger(Object prototype) {
+ return new FacetLongMerger();
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d60c72f3/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index 3590426..03cc480 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -471,7 +471,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") );
// multi-valued strings, method=dv for terms facets
- doStatsTemplated(client, params(p, "terms", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
+ doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
// single valued docvalues for strings, and single valued numeric doc values for numeric fields
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") );
@@ -491,6 +491,26 @@ public class TestJsonFacets extends SolrTestCaseHS {
if (p.get("num_is") == null) p.add("num_is","num_is");
if (p.get("num_fs") == null) p.add("num_fs","num_fs");
+ String terms = p.get("terms");
+ if (terms == null) terms="";
+ int limit=0;
+ switch (random().nextInt(4)) {
+ case 0: limit=-1;
+ case 1: limit=1000000;
+ case 2: // fallthrough
+ case 3: // fallthrough
+ }
+ if (limit != 0) {
+ terms=terms+"limit:"+limit+",";
+ }
+ String terms_method = p.get("terms_method");
+ if (terms_method != null) {
+ terms=terms+terms_method;
+ }
+ p.set("terms", terms);
+ // "${terms}" should be put at the beginning of generic terms facets.
+ // It may specify "method=..." or "limit:-1", so should not be used if the facet explicitly specifies.
+
MacroExpander m = new MacroExpander( p.getMap() );
String cat_s = m.expand("${cat_s}");
@@ -862,7 +882,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
// test numBuckets
client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
- , "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
+ , "json.facet", "{f1:{terms:{${terms_method} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
)
, "facets=={ 'count':6, " +
"'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } "
@@ -1048,10 +1068,10 @@ public class TestJsonFacets extends SolrTestCaseHS {
// also test limit:0
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
- " f0:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
- ",f1:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
- ",f2:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
- ",f3:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
+ " f0:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
+ ",f1:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
+ ",f2:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
+ ",f3:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
"}"
)
, "facets=={ 'count':6, " +
@@ -1066,9 +1086,9 @@ public class TestJsonFacets extends SolrTestCaseHS {
// also test limit:0
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
- " f0:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
- ",f1:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
- ",f2:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
+ " f0:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
+ ",f1:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
+ ",f2:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
"}"
)
, "facets=={ 'count':6, " +
@@ -1398,6 +1418,102 @@ public class TestJsonFacets extends SolrTestCaseHS {
}
+
+ ////////////////////////////////////////////////////////////////
+ // test which phase stats are calculated in
+ ////////////////////////////////////////////////////////////////
+ if (client.local()) {
+ long creates, resets;
+ // NOTE: these test the current implementation and may need to be adjusted to match future optimizations (such as calculating N buckets in parallel in the second phase)
+
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{x:'debug()'} }}}" // x should be deferred to 2nd phase
+ )
+ , "facets=={ 'count':6, " +
+ "f1:{ buckets:[{ val:batman, count:1, x:1}]} } "
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets <= 1);
+ assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably "1", but may be special slot for something. As long as it's not cardinality of the field
+
+
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{ x:'debug()'} , sort:'x asc' }}}" // sorting by x... must be done all at once in first phase
+ )
+ , "facets=={ 'count':6, " +
+ "f1:{ buckets:[{ val:batman, count:1, x:1}]}" +
+ " } "
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+ assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+
+ // When limit:-1, we should do most stats in first phase (SOLR-10634)
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{x:'debug()'} }}}"
+ )
+ , "facets=="
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+ assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+ // Now for a numeric field
+ // When limit:-1, we should do most stats in first phase (SOLR-10634)
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{x:'debug()'} }}}"
+ )
+ , "facets=="
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+ assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+
+ // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}"
+ )
+ , "facets=="
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
+ assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
+
+ // Now with a numeric field
+ // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
+ creates = DebugAgg.Acc.creates.get();
+ resets = DebugAgg.Acc.resets.get();
+ client.testJQ(params(p, "q", "*:*"
+ , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}"
+ )
+ , "facets=="
+ );
+
+ assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+ assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
+ assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
+ }
+ //////////////////////////////////////////////////////////////// end phase testing
+
+
+
}
@Test