You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@druid.apache.org by ab...@apache.org on 2022/02/16 15:24:03 UTC

[druid] branch master updated: Adding new config for disabling group by on multiValue column (#12253)

This is an automated email from the ASF dual-hosted git repository.

abhishek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/druid.git


The following commit(s) were added to refs/heads/master by this push:
     new 5794331  Adding new config for disabling group by on multiValue column (#12253)
5794331 is described below

commit 5794331eb132b16ed6bffe34c38be12a9cf4bcec
Author: Karan Kumar <ka...@gmail.com>
AuthorDate: Wed Feb 16 20:53:26 2022 +0530

    Adding new config for disabling group by on multiValue column (#12253)
    
    As part of #12078 one of the followup's was to have a specific config which does not allow accidental unnesting of multi value columns if such columns become part of the grouping key.
    Added a config groupByEnableMultiValueUnnesting which can be set in the query context.
    
    The default value of groupByEnableMultiValueUnnesting is true, therefore it does not change the current engine behavior.
    If groupByEnableMultiValueUnnesting is set to false, the query will fail if it encounters a multi-value column in the grouping key.
---
 docs/querying/groupbyquery.md                      |  1 +
 docs/querying/multi-value-dimensions.md            |  7 +++++
 .../druid/query/groupby/GroupByQueryConfig.java    | 14 +++++++++
 .../druid/query/groupby/GroupByQueryEngine.java    |  9 +++++-
 .../epinephelinae/GroupByQueryEngineV2.java        | 29 ++++++++++++++++-
 .../query/groupby/GroupByQueryRunnerTest.java      | 36 ++++++++++++++++++++++
 .../calcite/CalciteMultiValueStringQueryTest.java  | 32 ++++++++++++++++++-
 website/.spelling                                  |  4 +++
 8 files changed, 129 insertions(+), 3 deletions(-)

diff --git a/docs/querying/groupbyquery.md b/docs/querying/groupbyquery.md
index b20f24b..5502387 100644
--- a/docs/querying/groupbyquery.md
+++ b/docs/querying/groupbyquery.md
@@ -436,6 +436,7 @@ Supported query contexts:
 |`sortByDimsFirst`|Sort the results first by dimension values and then by timestamp.|false|
 |`forceLimitPushDown`|When all fields in the orderby are part of the grouping key, the Broker will push limit application down to the Historical processes. When the sorting order uses fields that are not in the grouping key, applying this optimization can result in approximate results with unknown accuracy, so this optimization is disabled by default in that case. Enabling this context flag turns on limit push down for limit/orderbys that contain non-grouping key columns.|false|
 |`applyLimitPushDownToSegment`|If Broker pushes limit down to queryable nodes (historicals, peons) then limit results during segment scan. This context value can be used to override `druid.query.groupBy.applyLimitPushDownToSegment`.|true|
+|`groupByEnableMultiValueUnnesting`|Safety flag to enable/disable the implicit unnesting on multi value column's as part of the grouping key. 'true' indicates multi-value grouping keys are unnested. 'false' returns an error if a multi value column is found as part of the grouping key.|true|
 
 
 #### GroupBy v1 configurations
diff --git a/docs/querying/multi-value-dimensions.md b/docs/querying/multi-value-dimensions.md
index 952f21a..36529a4 100644
--- a/docs/querying/multi-value-dimensions.md
+++ b/docs/querying/multi-value-dimensions.md
@@ -375,3 +375,10 @@ This query returns the following result:
 Note that, for groupBy queries, you could get similar result with a [having spec](having.md) but using a filtered
 `dimensionSpec` is much more efficient because that gets applied at the lowest level in the query processing pipeline.
 Having specs are applied at the outermost level of groupBy query processing.
+
+## Disable GroupBy on multi-value columns
+
+You can disable the implicit unnesting behavior for groupBy by setting groupByEnableMultiValueUnnesting: false in your 
+query context. In this mode, the groupBy engine will return an error instead of completing the query. This is a safety 
+feature for situations where you believe that all dimensions are singly-valued and want the engine to reject any 
+multi-valued dimensions that were inadvertently included.
\ No newline at end of file
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java
index 7bf2d0d..5e32e88 100644
--- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java
+++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryConfig.java
@@ -34,6 +34,7 @@ public class GroupByQueryConfig
   public static final String CTX_KEY_FORCE_PUSH_DOWN_NESTED_QUERY = "forcePushDownNestedQuery";
   public static final String CTX_KEY_EXECUTING_NESTED_QUERY = "executingNestedQuery";
   public static final String CTX_KEY_ARRAY_RESULT_ROWS = "resultAsArray";
+  public static final String CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING = "groupByEnableMultiValueUnnesting";
   private static final String CTX_KEY_IS_SINGLE_THREADED = "groupByIsSingleThreaded";
   private static final String CTX_KEY_MAX_INTERMEDIATE_ROWS = "maxIntermediateRows";
   private static final String CTX_KEY_MAX_RESULTS = "maxResults";
@@ -97,6 +98,9 @@ public class GroupByQueryConfig
   @JsonProperty
   private boolean vectorize = true;
 
+  @JsonProperty
+  private boolean enableMultiValueUnnesting = true;
+
   public String getDefaultStrategy()
   {
     return defaultStrategy;
@@ -192,6 +196,11 @@ public class GroupByQueryConfig
     return forcePushDownNestedQuery;
   }
 
+  public boolean isMultiValueUnnestingEnabled()
+  {
+    return enableMultiValueUnnesting;
+  }
+
   public GroupByQueryConfig withOverrides(final GroupByQuery query)
   {
     final GroupByQueryConfig newConfig = new GroupByQueryConfig();
@@ -244,6 +253,10 @@ public class GroupByQueryConfig
         getNumParallelCombineThreads()
     );
     newConfig.vectorize = query.getContextBoolean(QueryContexts.VECTORIZE_KEY, isVectorize());
+    newConfig.enableMultiValueUnnesting = query.getContextBoolean(
+        CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
+        isMultiValueUnnestingEnabled()
+    );
     return newConfig;
   }
 
@@ -266,6 +279,7 @@ public class GroupByQueryConfig
            ", numParallelCombineThreads=" + numParallelCombineThreads +
            ", vectorize=" + vectorize +
            ", forcePushDownNestedQuery=" + forcePushDownNestedQuery +
+           ", enableMultiValueUnnesting=" + enableMultiValueUnnesting +
            '}';
   }
 }
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java
index d3e072e..62a82dc 100644
--- a/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java
+++ b/processing/src/main/java/org/apache/druid/query/groupby/GroupByQueryEngine.java
@@ -32,6 +32,7 @@ import org.apache.druid.data.input.Row;
 import org.apache.druid.guice.annotations.Global;
 import org.apache.druid.java.util.common.IAE;
 import org.apache.druid.java.util.common.ISE;
+import org.apache.druid.java.util.common.UOE;
 import org.apache.druid.java.util.common.guava.BaseSequence;
 import org.apache.druid.java.util.common.guava.FunctionalIterator;
 import org.apache.druid.java.util.common.guava.Sequence;
@@ -91,7 +92,13 @@ public class GroupByQueryEngine
           "Null storage adapter found. Probably trying to issue a query against a segment being memory unmapped."
       );
     }
-
+    if (!query.getContextValue(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true)) {
+      throw new UOE(
+          "GroupBy v1 does not support %s as false. Set %s to true or use groupBy v2",
+          GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
+          GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
+      );
+    }
     final List<Interval> intervals = query.getQuerySegmentSpec().getIntervals();
     if (intervals.size() != 1) {
       throw new IAE("Should only have one interval, got[%s]", intervals);
diff --git a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java
index 5a5263e..f38f6ce 100644
--- a/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java
+++ b/processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/GroupByQueryEngineV2.java
@@ -459,6 +459,8 @@ public class GroupByQueryEngineV2
     @Nullable
     protected CloseableGrouperIterator<KeyType, ResultRow> delegate = null;
     protected final boolean allSingleValueDims;
+    protected final boolean allowMultiValueGrouping;
+
 
     public GroupByEngineIterator(
         final GroupByQuery query,
@@ -480,6 +482,10 @@ public class GroupByQueryEngineV2
       // Time is the same for every row in the cursor
       this.timestamp = fudgeTimestamp != null ? fudgeTimestamp : cursor.getTime();
       this.allSingleValueDims = allSingleValueDims;
+      this.allowMultiValueGrouping = query.getContextBoolean(
+          GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING,
+          true
+      );
     }
 
     private CloseableGrouperIterator<KeyType, ResultRow> initNewDelegate()
@@ -593,6 +599,19 @@ public class GroupByQueryEngineV2
       return indexedInts.size() == 1 ? indexedInts.get(0) : GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE;
     }
 
+    protected void checkIfMultiValueGroupingIsAllowed(String dimName)
+    {
+      if (!allowMultiValueGrouping) {
+        throw new ISE(
+            "Encountered multi-value dimension %s that cannot be processed with %s set to false."
+            + " Consider setting %s to true.",
+            dimName,
+            GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
+            GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
+        );
+      }
+    }
+
   }
 
   private static class HashAggregateIterator extends GroupByEngineIterator<ByteBuffer>
@@ -764,6 +783,9 @@ public class GroupByQueryEngineV2
             );
 
             if (doAggregate) {
+              // this check is done during the row aggregation as a dimension can become multi-value col if column
+              // capabilities is unknown.
+              checkIfMultiValueGroupingIsAllowed(dims[stackPointer].getName());
               stack[stackPointer]++;
               for (int i = stackPointer + 1; i < stack.length; i++) {
                 dims[i].getColumnSelectorStrategy().initGroupingKeyColumnValue(
@@ -889,12 +911,17 @@ public class GroupByQueryEngineV2
       }
 
       while (!cursor.isDone()) {
-        int multiValuesSize = multiValues.size();
+        final int multiValuesSize = multiValues.size();
         if (multiValuesSize == 0) {
           if (!grouper.aggregate(GroupByColumnSelectorStrategy.GROUP_BY_MISSING_VALUE).isOk()) {
             return;
           }
         } else {
+          if (multiValuesSize > 1) {
+            // this check is done during the row aggregation as a dimension can become multi-value col if column
+            // capabilities is unknown.
+            checkIfMultiValueGroupingIsAllowed(dim.getName());
+          }
           for (; nextValIndex < multiValuesSize; nextValIndex++) {
             if (!grouper.aggregate(multiValues.get(nextValIndex)).isOk()) {
               return;
diff --git a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java
index 5b89eb3..4b958d3 100644
--- a/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java
+++ b/processing/src/test/java/org/apache/druid/query/groupby/GroupByQueryRunnerTest.java
@@ -40,6 +40,7 @@ import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.Pair;
 import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.java.util.common.UOE;
 import org.apache.druid.java.util.common.concurrent.Execs;
 import org.apache.druid.java.util.common.granularity.DurationGranularity;
 import org.apache.druid.java.util.common.granularity.Granularities;
@@ -1311,6 +1312,41 @@ public class GroupByQueryRunnerTest extends InitializedNullHandlingTest
   }
 
   @Test
+  public void testMultiValueDimensionNotAllowed()
+  {
+
+    if (config.getDefaultStrategy().equals(GroupByStrategySelector.STRATEGY_V1)) {
+      expectedException.expect(UOE.class);
+      expectedException.expectMessage(StringUtils.format(
+          "GroupBy v1 does not support %s as false",
+          GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING
+      ));
+    } else if (!vectorize) {
+      expectedException.expect(RuntimeException.class);
+      expectedException.expectMessage(StringUtils.format(
+          "Encountered multi-value dimension %s that cannot be processed with %s set to false."
+          + " Consider setting %s to true.",
+          "placementish",
+          GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
+          GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
+      ));
+    } else {
+      cannotVectorize();
+    }
+
+    GroupByQuery query = makeQueryBuilder()
+        .setDataSource(QueryRunnerTestHelper.DATA_SOURCE)
+        .setQuerySegmentSpec(QueryRunnerTestHelper.FIRST_TO_THIRD)
+        .setDimensions(new DefaultDimensionSpec("placementish", "alias"))
+        .setAggregatorSpecs(QueryRunnerTestHelper.ROWS_COUNT, new LongSumAggregatorFactory("idx", "index"))
+        .setGranularity(QueryRunnerTestHelper.ALL_GRAN)
+        .overrideContext(ImmutableMap.of(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false))
+        .build();
+
+    GroupByQueryRunnerTestHelper.runQuery(factory, runner, query);
+  }
+
+  @Test
   public void testMultiValueDimensionAsArray()
   {
     // array types don't work with group by v1
diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java
index e720333..b28f418 100644
--- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java
+++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteMultiValueStringQueryTest.java
@@ -22,6 +22,7 @@ package org.apache.druid.sql.calcite;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import org.apache.druid.common.config.NullHandling;
+import org.apache.druid.java.util.common.StringUtils;
 import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.math.expr.ExpressionProcessing;
 import org.apache.druid.query.Druids;
@@ -32,6 +33,7 @@ import org.apache.druid.query.filter.InDimFilter;
 import org.apache.druid.query.filter.LikeDimFilter;
 import org.apache.druid.query.filter.SelectorDimFilter;
 import org.apache.druid.query.groupby.GroupByQuery;
+import org.apache.druid.query.groupby.GroupByQueryConfig;
 import org.apache.druid.query.groupby.orderby.DefaultLimitSpec;
 import org.apache.druid.query.groupby.orderby.OrderByColumnSpec;
 import org.apache.druid.query.ordering.StringComparators;
@@ -43,7 +45,9 @@ import org.apache.druid.sql.calcite.filtration.Filtration;
 import org.apache.druid.sql.calcite.util.CalciteTests;
 import org.junit.Test;
 
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
 {
@@ -53,7 +57,8 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
   {
     // Cannot vectorize due to usage of expressions.
     cannotVectorize();
-
+    Map<String, Object> groupByOnMultiValueColumnEnabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
+    groupByOnMultiValueColumnEnabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, true);
     List<Object[]> expected;
     if (NullHandling.replaceWithDefault()) {
       expected = ImmutableList.of(
@@ -76,6 +81,7 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
     }
     testQuery(
         "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
+        groupByOnMultiValueColumnEnabled,
         ImmutableList.of(
             GroupByQuery.builder()
                         .setDataSource(CalciteTests.DATASOURCE3)
@@ -104,6 +110,30 @@ public class CalciteMultiValueStringQueryTest extends BaseCalciteQueryTest
   }
 
   @Test
+  public void testMultiValueStringGroupByDoesNotWork() throws Exception
+  {
+    // Cannot vectorize due to usage of expressions.
+    cannotVectorize();
+    Map<String, Object> groupByOnMultiValueColumnDisabled = new HashMap<>(QUERY_CONTEXT_DEFAULT);
+    groupByOnMultiValueColumnDisabled.put(GroupByQueryConfig.CTX_KEY_ENABLE_MULTI_VALUE_UNNESTING, false);
+    testQueryThrows(
+        "SELECT concat(dim3, 'foo'), SUM(cnt) FROM druid.numfoo GROUP BY 1 ORDER BY 2 DESC",
+        groupByOnMultiValueColumnDisabled,
+        ImmutableList.of(),
+        exception -> {
+          exception.expect(RuntimeException.class);
+          expectedException.expectMessage(StringUtils.format(
+              "Encountered multi-value dimension %s that cannot be processed with %s set to false."
+              + " Consider setting %s to true.",
+              "v0",
+              GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY,
+              GroupByQueryConfig.CTX_KEY_EXECUTING_NESTED_QUERY
+          ));
+        }
+    );
+  }
+
+  @Test
   public void testMultiValueStringWorksLikeStringGroupByWithFilter() throws Exception
   {
     // Cannot vectorize due to usage of expressions.
diff --git a/website/.spelling b/website/.spelling
index 8c44966..f67272f 100644
--- a/website/.spelling
+++ b/website/.spelling
@@ -1565,6 +1565,8 @@ outputName
 pushdown
 row1
 subtotalsSpec
+unnested
+unnesting
  - ../docs/querying/having.md
 HavingSpec
 HavingSpecs
@@ -1595,6 +1597,8 @@ row4
 t3
 t4
 t5
+groupByEnableMultiValueUnnesting
+unnesting
  - ../docs/querying/multitenancy.md
 500ms
 tenant_id

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@druid.apache.org
For additional commands, e-mail: commits-help@druid.apache.org