You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by th...@apache.org on 2021/06/30 14:41:03 UTC

[solr] branch main updated: SOLR-15475: Implement COUNT and APPROX_COUNT_DISTINCT aggregation functions for Parallel SQL (#194)

This is an automated email from the ASF dual-hosted git repository.

thelabdude pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new daa8a54  SOLR-15475: Implement COUNT and APPROX_COUNT_DISTINCT aggregation functions for Parallel SQL (#194)
daa8a54 is described below

commit daa8a549f500028f0926c5e2b160eaf547a30cfb
Author: Timothy Potter <th...@gmail.com>
AuthorDate: Wed Jun 30 08:40:53 2021 -0600

    SOLR-15475: Implement COUNT and APPROX_COUNT_DISTINCT aggregation functions for Parallel SQL (#194)
---
 solr/CHANGES.txt                                   |  6 ++-
 .../org/apache/solr/handler/sql/SolrAggregate.java | 38 ++++++++++--------
 .../java/org/apache/solr/handler/sql/SolrRel.java  |  5 ++-
 .../org/apache/solr/handler/sql/SolrTable.java     | 11 +++++-
 .../org/apache/solr/handler/TestSQLHandler.java    | 40 +++++++++++++++++++
 .../solr-ref-guide/src/parallel-sql-interface.adoc | 45 ++++++++++++++--------
 .../io/stream/metrics/CountDistinctMetric.java     | 11 +++++-
 .../solrj/io/stream/metrics/CountMetric.java       |  1 +
 8 files changed, 116 insertions(+), 41 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 47389a4..86ea9a2 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -143,16 +143,18 @@ when told to. The admin UI now tells it to. (Nazerke Seidan, David Smiley)
 
 * SOLR-15362: Let core and collection dropdowns in Admin UI float wide to see entire core or collection name.  (Matthias Krepp, Eric Pugh)
 
-* SOLR-15460: Implement LIKE, IS NOT NULL, IS NULL, and support wildcard * in equals string literal for Parallel SQL (Timothy Potter, Houston Putman)
-
 * SOLR-15044: When indexing nested docs via JSON, it is no longer necessary to provide child doc IDs.
   This was already working for XML & "javabin"/SolrJ.  Previously, omitting the ID would be confused
   for a partial/atomic update.  (David Smiley)
 
+* SOLR-15460: Implement LIKE, IS NOT NULL, IS NULL, and support wildcard * in equals string literal for Parallel SQL (Timothy Potter, Houston Putman)
+
 * SOLR-15456: Get field type info from luke for custom fields instead of defaulting to String in Parallel SQL (Timothy Potter)
 
 * SOLR-15489: Implement OFFSET & FETCH for LIMIT SQL queries (Timothy Potter)
 
+* SOLR-15475: Implement COUNT and APPROX_COUNT_DISTINCT aggregation functions for Parallel SQL (Timothy Potter)
+
 Other Changes
 ----------------------
 * SOLR-14656: Autoscaling framework removed (Ishan Chattopadhyaya, noble, Ilan Ginzburg)
diff --git a/solr/core/src/java/org/apache/solr/handler/sql/SolrAggregate.java b/solr/core/src/java/org/apache/solr/handler/sql/SolrAggregate.java
index 6ae02e0..e1c6a6f 100644
--- a/solr/core/src/java/org/apache/solr/handler/sql/SolrAggregate.java
+++ b/solr/core/src/java/org/apache/solr/handler/sql/SolrAggregate.java
@@ -16,6 +16,10 @@
  */
 package org.apache.solr.handler.sql;
 
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
 import org.apache.calcite.plan.RelOptCluster;
 import org.apache.calcite.plan.RelTraitSet;
 import org.apache.calcite.rel.RelNode;
@@ -27,7 +31,8 @@ import org.apache.calcite.sql.fun.SqlStdOperatorTable;
 import org.apache.calcite.util.ImmutableBitSet;
 import org.apache.calcite.util.Pair;
 
-import java.util.*;
+import static org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric.APPROX_COUNT_DISTINCT;
+import static org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric.COUNT_DISTINCT;
 
 /**
  * Implementation of {@link org.apache.calcite.rel.core.Aggregate} relational expression in Solr.
@@ -42,6 +47,13 @@ class SolrAggregate extends Aggregate implements SolrRel {
       SqlStdOperatorTable.AVG
   );
 
+  // Returns the Solr agg metric identifier (includes column) for the SQL metric
+  static String solrAggMetricId(String metric, String column) {
+    // CountDistinctMetric's getIdentifer returns "countDist" but all others return a lowercased value
+    String funcName = COUNT_DISTINCT.equals(metric) ? COUNT_DISTINCT : metric.toLowerCase(Locale.ROOT);
+    return String.format(Locale.ROOT, "%s(%s)", funcName, column);
+  }
+
   SolrAggregate(
       RelOptCluster cluster,
       RelTraitSet traitSet,
@@ -72,25 +84,19 @@ class SolrAggregate extends Aggregate implements SolrRel {
     implementor.visitChild(0, getInput());
 
     final List<String> inNames = SolrRules.solrFieldNames(getInput().getRowType());
-
-
-    for(Pair<AggregateCall, String> namedAggCall : getNamedAggCalls()) {
+    for (Pair<AggregateCall, String> namedAggCall : getNamedAggCalls()) {
 
       AggregateCall aggCall = namedAggCall.getKey();
-
       Pair<String, String> metric = toSolrMetric(implementor, aggCall, inNames);
-      implementor.addReverseAggMapping(namedAggCall.getValue(), metric.getKey().toLowerCase(Locale.ROOT)+"("+metric.getValue()+")");
-      implementor.addMetricPair(namedAggCall.getValue(), metric.getKey(), metric.getValue());
-      /*
-      if(aggCall.getName() == null) {
-        System.out.println("AGG:"+namedAggCall.getValue()+":"+ aggCall.getAggregation().getName() + "(" + inNames.get(aggCall.getArgList().get(0)) + ")");
-        implementor.addFieldMapping(namedAggCall.getValue(),
-          aggCall.getAggregation().getName() + "(" + inNames.get(aggCall.getArgList().get(0)) + ")");
-      }
-      */
+
+      boolean isDistinct = SqlStdOperatorTable.COUNT.equals(aggCall.getAggregation()) && aggCall.isDistinct();
+      // map the SQL COUNT to either countDist or hll for distinct ops, otherwise, the metric names map over directly
+      String metricKey = isDistinct ? (aggCall.isApproximate() ? APPROX_COUNT_DISTINCT : COUNT_DISTINCT) : metric.getKey();
+      implementor.addReverseAggMapping(namedAggCall.getValue(), solrAggMetricId(metricKey, metric.getValue()));
+      implementor.addMetricPair(namedAggCall.getValue(), metricKey, metric.getValue());
     }
 
-    for(int group : getGroupSet()) {
+    for (int group : getGroupSet()) {
       String inName = inNames.get(group);
       implementor.addBucket(inName);
     }
@@ -108,7 +114,7 @@ class SolrAggregate extends Aggregate implements SolrRel {
       case 1:
         String inName = inNames.get(args.get(0));
         String name = implementor.fieldMappings.getOrDefault(inName, inName);
-        if(SUPPORTED_AGGREGATIONS.contains(aggregation)) {
+        if (SUPPORTED_AGGREGATIONS.contains(aggregation)) {
           return new Pair<>(aggregation.getName(), name);
         }
       default:
diff --git a/solr/core/src/java/org/apache/solr/handler/sql/SolrRel.java b/solr/core/src/java/org/apache/solr/handler/sql/SolrRel.java
index 3467e4a..db34c43 100644
--- a/solr/core/src/java/org/apache/solr/handler/sql/SolrRel.java
+++ b/solr/core/src/java/org/apache/solr/handler/sql/SolrRel.java
@@ -23,6 +23,8 @@ import org.apache.calcite.util.Pair;
 
 import java.util.*;
 
+import static org.apache.solr.handler.sql.SolrAggregate.solrAggMetricId;
+
 /**
  * Relational expression that uses Solr calling convention.
  */
@@ -84,9 +86,8 @@ interface SolrRel extends RelNode {
       column = this.fieldMappings.getOrDefault(column, column);
       this.metricPairs.add(new Pair<>(metric, column));
 
-      String metricIdentifier = metric.toLowerCase(Locale.ROOT) + "(" + column + ")";
       if(outName != null) {
-        this.addFieldMapping(outName, metricIdentifier, true);
+        this.addFieldMapping(outName, solrAggMetricId(metric, column), true);
       }
     }
 
diff --git a/solr/core/src/java/org/apache/solr/handler/sql/SolrTable.java b/solr/core/src/java/org/apache/solr/handler/sql/SolrTable.java
index b3b0d56..00d01b3 100644
--- a/solr/core/src/java/org/apache/solr/handler/sql/SolrTable.java
+++ b/solr/core/src/java/org/apache/solr/handler/sql/SolrTable.java
@@ -55,6 +55,9 @@ import org.apache.solr.common.params.ModifiableSolrParams;
 import java.io.IOException;
 import java.util.*;
 import java.util.stream.Collectors;
+
+import static org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric.APPROX_COUNT_DISTINCT;
+import static org.apache.solr.client.solrj.io.stream.metrics.CountDistinctMetric.COUNT_DISTINCT;
 import static org.apache.solr.common.params.CommonParams.SORT;
 
 /**
@@ -236,6 +239,10 @@ class SolrTable extends AbstractQueryableTable implements TranslatableTable {
 
   private Metric getMetric(Pair<String, String> metricPair) {
     switch (metricPair.getKey()) {
+      case COUNT_DISTINCT:
+        return new CountDistinctMetric(metricPair.getValue());
+      case APPROX_COUNT_DISTINCT:
+        return new CountDistinctMetric(metricPair.getValue(), true);
       case "COUNT":
         return new CountMetric(metricPair.getValue());
       case "SUM":
@@ -610,7 +617,7 @@ class SolrTable extends AbstractQueryableTable implements TranslatableTable {
     } else {
       for(Metric metric : metrics) {
         Class<?> c = fmap.get(metric.getIdentifier());
-        if(Long.class.equals(c)) {
+        if (!metric.outputLong && Long.class.equals(c)) {
           metric.outputLong = true;
         }
       }
@@ -833,7 +840,7 @@ class SolrTable extends AbstractQueryableTable implements TranslatableTable {
 
     for(Metric metric : metrics) {
       Class<?> c = fmap.get(metric.getIdentifier());
-      if(Long.class.equals(c)) {
+      if (!metric.outputLong && Long.class.equals(c)) {
         metric.outputLong = true;
       }
     }
diff --git a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
index 68defe7..2a410d7 100644
--- a/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
+++ b/solr/core/src/test/org/apache/solr/handler/TestSQLHandler.java
@@ -2178,4 +2178,44 @@ public class TestSQLHandler extends SolrCloudTestCase {
     // Solr doesn't support OFFSET w/o LIMIT
     expectThrows(IOException.class, () -> expectResults("SELECT id FROM $ALIAS ORDER BY id DESC OFFSET 5", 5));
   }
+
+  @Test
+  public void testCountDistinct() throws Exception {
+    UpdateRequest updateRequest = new UpdateRequest();
+    final int cardinality = 5;
+    final int maxDocs = 100; // keep this an even # b/c we divide by 2 in this test
+    final String padFmt = "%03d";
+    for (int i = 0; i < maxDocs; i++) {
+      updateRequest = addDocForDistinctTests(i, updateRequest, cardinality, padFmt);
+    }
+    updateRequest.commit(cluster.getSolrClient(), COLLECTIONORALIAS);
+
+    List<Tuple> tuples = expectResults("SELECT COUNT(1) AS total_rows, COUNT(distinct str_s) AS distinct_str, MIN(str_s) AS min_str, MAX(str_s) AS max_str FROM $ALIAS", 1);
+    Tuple firstRow = tuples.get(0);
+    assertEquals(maxDocs, (long) firstRow.getLong("total_rows"));
+    assertEquals(cardinality, (long) firstRow.getLong("distinct_str"));
+
+    String expectedMin = String.format(Locale.ROOT, padFmt, 0);
+    String expectedMax = String.format(Locale.ROOT, padFmt, cardinality - 1); // max is card-1
+    assertEquals(expectedMin, firstRow.getString("min_str"));
+    assertEquals(expectedMax, firstRow.getString("max_str"));
+
+    tuples = expectResults("SELECT DISTINCT str_s FROM $ALIAS ORDER BY str_s ASC", cardinality);
+    for (int t = 0; t < tuples.size(); t++) {
+      assertEquals(String.format(Locale.ROOT, padFmt, t), tuples.get(t).getString("str_s"));
+    }
+
+    tuples = expectResults("SELECT APPROX_COUNT_DISTINCT(distinct str_s) AS approx_distinct FROM $ALIAS", 1);
+    firstRow = tuples.get(0);
+    assertEquals(cardinality, (long) firstRow.getLong("approx_distinct"));
+
+    tuples = expectResults("SELECT country_s, COUNT(*) AS count_per_bucket FROM $ALIAS GROUP BY country_s", 2);
+    assertEquals(maxDocs/2L, (long)tuples.get(0).getLong("count_per_bucket"));
+    assertEquals(maxDocs/2L, (long)tuples.get(1).getLong("count_per_bucket"));
+  }
+
+  private UpdateRequest addDocForDistinctTests(int id, UpdateRequest updateRequest, int cardinality, String padFmt) {
+    String country = id % 2 == 0 ? "US" : "CA";
+    return updateRequest.add("id", String.valueOf(id), "str_s", String.format(Locale.ROOT, padFmt, id % cardinality), "country_s", country);
+  }
 }
diff --git a/solr/solr-ref-guide/src/parallel-sql-interface.adoc b/solr/solr-ref-guide/src/parallel-sql-interface.adoc
index 9947835..1c5b300 100644
--- a/solr/solr-ref-guide/src/parallel-sql-interface.adoc
+++ b/solr/solr-ref-guide/src/parallel-sql-interface.adoc
@@ -256,14 +256,14 @@ The parallel SQL interface supports and pushes down most common SQL operators, s
 |IN |Specify multiple values (shorthand for multiple OR clasues) |`fielda IN (10,20,30)` |`(fielda:10 OR fielda:20 OR fielda:30)`
 |LIKE |Wildcard match on string or text fields |`fielda LIKE 'day%'` |`{!complexphrase}fielda:"day*"`
 |BETWEEN |Range match |`fielda BETWEEN 2 AND 4` |`fielda: [2 TO 4]`
-|IS NULL |Match columns with null value |`fielda IS NULL` |`(*:* -field:*)`
+|IS NULL |Match columns with null value |`fielda IS NULL` |+++(*:* -field:*)+++
 |IS NOT NULL |Match columns with value |`fielda IS NOT NULL` |`field:*`
 |===
 
 * IN, LIKE, BETWEEN support the NOT keyword to find rows where the condition is not true, such as `fielda NOT LIKE 'day%'`
 * String literals must be wrapped in single-quotes; double-quotes indicate database objects and not a string literal.
 * A simplistic LIKE can be used with an asterisk wildcard, such as `field = 'sam*'`; this is Solr specific and not part of the SQL standard.
-* When performing ANDed range queries over a multi-valued field, Apache Calcite short-circuits to zero results if the ANDed predicates appear to be disjoint sets. For example, `b_is <= 2 AND b_is >= 5` appears to Calcite to be disjoint sets, which they are from a single-valued field perspective. However, this may not be the case with multi-valued fields, as Solr might match documents. The work-around is to use Solr query syntax directly inside of an equals expression wrapped in parens: ` [...]
+* When performing ANDed range queries over a multi-valued field, Apache Calcite short-circuits to zero results if the ANDed predicates appear to be disjoint sets. For example, +++b_is <= 2 AND b_is >= 5+++ appears to Calcite to be disjoint sets, which they are from a single-valued field perspective. However, this may not be the case with multi-valued fields, as Solr might match documents. The work-around is to use Solr query syntax directly inside of an equals expression wrapped in paren [...]
 
 === ORDER BY Clause
 
@@ -316,15 +316,18 @@ SELECT distinct fieldA as fa, fieldB as fb FROM tableA ORDER BY fa desc, fb desc
 
 === Statistical Functions
 
-The SQL interface supports simple statistics calculated on numeric fields. The supported functions are `count(*)`, `min`, `max`, `sum`, and `avg`.
+The SQL interface supports simple statistics calculated on numeric fields.
+The supported functions are `COUNT(*)`, `COUNT(DISTINCT field)`, `APPROX_COUNT_DISTINCT(field)`, `MIN`, `MAX`, `SUM`, and `AVG`.
 
 Because these functions never require data to be shuffled, the aggregations are pushed down into the search engine and are generated by the <<the-stats-component.adoc#,StatsComponent>>.
 
 [source,sql]
 ----
-SELECT count(*) as count, sum(fieldB) as sum FROM tableA WHERE fieldC = 'Hello'
+SELECT COUNT(*) as count, SUM(fieldB) as sum FROM tableA WHERE fieldC = 'Hello'
 ----
 
+The `APPROX_COUNT_DISTINCT` metric uses Solr's HyperLogLog (hll) statistical function to compute an approximate cardinality for the given field and should be used when query performance is important and an exact count is not needed.
+
 === GROUP BY Aggregations
 
 The SQL interface also supports `GROUP BY` aggregate queries.
@@ -337,8 +340,13 @@ Here is a basic example of a GROUP BY query that requests aggregations:
 
 [source,sql]
 ----
-SELECT fieldA as fa, fieldB as fb, count(*) as count, sum(fieldC) as sum, avg(fieldY) as avg FROM tableA WHERE fieldC = 'term1 term2'
-GROUP BY fa, fb HAVING sum > 1000 ORDER BY sum asc LIMIT 100
+  SELECT fieldA as fa, fieldB as fb, COUNT(*) as count, SUM(fieldC) as sum, AVG(fieldY) as avg
+    FROM tableA
+   WHERE fieldC = 'term1 term2'
+GROUP BY fa, fb
+  HAVING sum > 1000
+ORDER BY sum asc
+   LIMIT 100
 ----
 
 Let's break this down into pieces:
@@ -347,27 +355,30 @@ Let's break this down into pieces:
 
 The Column Identifiers can contain both fields in the Solr index and aggregate functions. The supported aggregate functions are:
 
-* `count(*)`: Counts the number of records over a set of buckets.
-* `sum(field)`: Sums a numeric field over over a set of buckets.
-* `avg(field)`: Averages a numeric field over a set of buckets.
-* `min(field)`: Returns the min value of a numeric field over a set of buckets.
-* `max:(field)`: Returns the max value of a numerics over a set of buckets.
+* `COUNT(*)`: Counts the number of records over a set of buckets.
+* `SUM(field)`: Sums a numeric field over over a set of buckets.
+* `AVG(field)`: Averages a numeric field over a set of buckets.
+* `MIN(field)`: Returns the min value of a numeric field over a set of buckets.
+* `MAX(field)`: Returns the max value of a numerics over a set of buckets.
 
 The non-function fields in the field list determine the fields to calculate the aggregations over.
 
+Computing the number of distinct values for a specific field within each group using `COUNT(DISTINCT <field>)` is not currently supported by Solr;
+only `COUNT(*)` can be computed for each GROUP BY dimension.
+
 === HAVING Clause
 
 The `HAVING` clause may contain any function listed in the field list. Complex `HAVING` clauses such as this are supported:
 
 [source,sql]
 ----
-SELECT fieldA, fieldB, count(*), sum(fieldC), avg(fieldY)
-FROM tableA
-WHERE fieldC = 'term1 term2'
+  SELECT fieldA, fieldB, COUNT(*), SUM(fieldC), AVG(fieldY)
+    FROM tableA
+   WHERE fieldC = 'term1 term2'
 GROUP BY fieldA, fieldB
-HAVING ((sum(fieldC) > 1000) AND (avg(fieldY) <= 10))
-ORDER BY sum(fieldC) asc
-LIMIT 100
+  HAVING ((SUM(fieldC) > 1000) AND (AVG(fieldY) <= 10))
+ORDER BY SUM(fieldC) ASC
+   LIMIT 100
 ----
 
 == Best Practices
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountDistinctMetric.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountDistinctMetric.java
index 623fc22..8fe5bff 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountDistinctMetric.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountDistinctMetric.java
@@ -26,10 +26,17 @@ import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
 
 public class CountDistinctMetric extends Metric {
 
+    public static final String COUNT_DISTINCT = "countDist";
+    public static final String APPROX_COUNT_DISTINCT = "hll";
+
     private String columnName;
 
     public CountDistinctMetric(String columnName){
-        init("countDist", columnName);
+        this(columnName, false);
+    }
+
+    public CountDistinctMetric(String columnName, boolean isApproximate){
+        init(isApproximate ? APPROX_COUNT_DISTINCT : COUNT_DISTINCT, columnName);
     }
 
     public CountDistinctMetric(StreamExpression expression, StreamFactory factory) throws IOException{
@@ -58,7 +65,7 @@ public class CountDistinctMetric extends Metric {
     }
 
     public Metric newInstance() {
-        return new MeanMetric(columnName, outputLong);
+        return new CountDistinctMetric(columnName);
     }
 
     public String[] getColumns() {
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountMetric.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountMetric.java
index 093b95e..681e3ca 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountMetric.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/metrics/CountMetric.java
@@ -58,6 +58,7 @@ public class CountMetric extends Metric {
   private void init(String functionName, String columnName){
     this.columnName = columnName;
     this.isAllColumns = "*".equals(this.columnName);
+    this.outputLong = true;
     setFunctionName(functionName);
     setIdentifier(functionName, "(", columnName, ")");
   }