You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by en...@apache.org on 2023/04/14 08:13:33 UTC

[doris] branch master updated: [opt](nereids) optimize aggregation estimation #18607

This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4174d5a707 [opt](nereids) optimize aggregation estimation #18607
4174d5a707 is described below

commit 4174d5a7077ca540ba0a70d7504d5f973819e2cf
Author: minghong <en...@gmail.com>
AuthorDate: Fri Apr 14 16:13:25 2023 +0800

    [opt](nereids) optimize aggregation estimation #18607
    
    `select count(*) from T group by A, B`
    Suppose `ndv(A) > ndv(B)`.
    The estimated row count of the aggregate is between ndv(A) and ndv(A) * ndv(B).
    
    In the previous version, we chose the upper bound, that is, ndv(A) * ndv(B). The drawback of this choice is that the estimated row count is often larger than the row count of T.
    
    In this version, we choose the lower bound, i.e. the largest NDV among the group-by keys.
---
 .../rules/implementation/AggregateStrategies.java  | 25 --------------
 .../doris/nereids/stats/StatsCalculator.java       |  7 ++--
 .../data/nereids_tpchPlanShape_p0/shape/q10.out    | 33 +++++++++----------
 .../data/nereids_tpchPlanShape_p0/shape/q13.out    | 17 +++++-----
 .../data/nereids_tpchPlanShape_p0/shape/q18.out    | 38 ++++++++++------------
 5 files changed, 44 insertions(+), 76 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
index aa39ad3467..be914f938e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
@@ -63,7 +63,6 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalStorageLayerAggrega
 import org.apache.doris.nereids.util.ExpressionUtils;
 import org.apache.doris.nereids.util.TypeCoercionUtils;
 import org.apache.doris.qe.ConnectContext;
-import org.apache.doris.statistics.Statistics;
 
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
@@ -303,20 +302,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
         }
     }
 
-    private boolean aggregateOnUniqueColumn(
-            LogicalAggregate<? extends Plan> logicalAgg) {
-        if (logicalAgg.child() instanceof GroupPlan) {
-            Statistics childStats = ((GroupPlan) logicalAgg.child()).getGroup().getStatistics();
-            if (childStats != null) {
-                return logicalAgg.getGroupByExpressions().stream().anyMatch(
-                        expression ->
-                            childStats.almostUniqueExpression(expression)
-                );
-            }
-        }
-        return false;
-    }
-
     /**
      * sql: select count(*) from tbl group by id
      *
@@ -345,11 +330,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
      */
     private List<PhysicalHashAggregate<Plan>> onePhaseAggregateWithoutDistinct(
             LogicalAggregate<? extends Plan> logicalAgg, ConnectContext connectContext) {
-        if (!logicalAgg.getGroupByExpressions().isEmpty()
-                && !aggregateOnUniqueColumn(logicalAgg)) {
-            // twoPhaseAggregate beats onePhaseAggregate
-            return null;
-        }
         RequireProperties requireGather = RequireProperties.of(PhysicalProperties.GATHER);
         AggregateParam inputToResultParam = AggregateParam.localResult();
         List<NamedExpression> newOutput = ExpressionUtils.rewriteDownShortCircuit(
@@ -776,11 +756,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
      */
     private List<PhysicalHashAggregate<? extends Plan>> twoPhaseAggregateWithDistinct(
             LogicalAggregate<? extends Plan> logicalAgg, ConnectContext connectContext) {
-        if (!logicalAgg.getGroupByExpressions().isEmpty()
-                && !aggregateOnUniqueColumn(logicalAgg)) {
-            // threePhaseAggregate beats twoPhaseAggregate
-            return null;
-        }
         Set<AggregateFunction> aggregateFunctions = logicalAgg.getAggregateFunctions();
 
         Set<Expression> distinctArguments = aggregateFunctions.stream()
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 5be38b5dde..4c53ff0c1d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -479,11 +479,8 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
                     //all column stats are unknown, use default ratio
                     resultSetCount = inputRowCount * DEFAULT_AGGREGATE_RATIO;
                 } else {
-                    resultSetCount = groupByKeyStats.stream()
-                            .map(s -> s.ndv)
-                            .reduce(1.0, (a, b) -> a * b);
-                    //agg output tuples should be less than input tuples
-                    resultSetCount = Math.min(resultSetCount, inputRowCount);
+                    resultSetCount = groupByKeyStats.stream().map(s -> s.ndv)
+                            .max(Double::compare).get();
                 }
             }
         }
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
index 5cdb232001..b70a354e27 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
@@ -4,24 +4,23 @@ PhysicalTopN
 --PhysicalDistribute
 ----PhysicalTopN
 ------PhysicalProject
---------hashAgg[GLOBAL]
+--------hashAgg[LOCAL]
 ----------PhysicalDistribute
-------------hashAgg[LOCAL]
---------------PhysicalProject
-----------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
-------------------PhysicalProject
---------------------filter((lineitem.l_returnflag = 'R'))
-----------------------PhysicalOlapScan[lineitem]
-------------------PhysicalDistribute
---------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
-----------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
-------------------------PhysicalProject
---------------------------PhysicalOlapScan[customer]
-------------------------PhysicalDistribute
---------------------------PhysicalProject
-----------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
-------------------------------PhysicalOlapScan[orders]
+------------PhysicalProject
+--------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
+----------------PhysicalProject
+------------------filter((lineitem.l_returnflag = 'R'))
+--------------------PhysicalOlapScan[lineitem]
+----------------PhysicalDistribute
+------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
+--------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[customer]
 ----------------------PhysicalDistribute
 ------------------------PhysicalProject
---------------------------PhysicalOlapScan[nation]
+--------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
+----------------------------PhysicalOlapScan[orders]
+--------------------PhysicalDistribute
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[nation]
 
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
index 6df8a92b08..d196ba817e 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
@@ -7,14 +7,13 @@ PhysicalQuickSort
 --------PhysicalDistribute
 ----------hashAgg[LOCAL]
 ------------PhysicalProject
---------------hashAgg[GLOBAL]
-----------------hashAgg[LOCAL]
-------------------PhysicalProject
---------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey)
-----------------------PhysicalDistribute
-------------------------PhysicalProject
---------------------------filter(( not (o_comment like '%special%requests%')))
-----------------------------PhysicalOlapScan[orders]
+--------------hashAgg[LOCAL]
+----------------PhysicalProject
+------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey)
+--------------------PhysicalDistribute
 ----------------------PhysicalProject
-------------------------PhysicalOlapScan[customer]
+------------------------filter(( not (o_comment like '%special%requests%')))
+--------------------------PhysicalOlapScan[orders]
+--------------------PhysicalProject
+----------------------PhysicalOlapScan[customer]
 
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
index 299b4bb581..3e05c3a7c0 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
@@ -3,25 +3,23 @@
 PhysicalTopN
 --PhysicalDistribute
 ----PhysicalTopN
-------hashAgg[GLOBAL]
---------hashAgg[LOCAL]
-----------PhysicalProject
-------------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+------hashAgg[LOCAL]
+--------PhysicalProject
+----------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+------------PhysicalProject
+--------------PhysicalOlapScan[lineitem]
+------------PhysicalDistribute
 --------------PhysicalProject
-----------------PhysicalOlapScan[lineitem]
---------------PhysicalDistribute
-----------------PhysicalProject
-------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
---------------------PhysicalProject
-----------------------PhysicalOlapScan[customer]
---------------------PhysicalDistribute
-----------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey)
-------------------------PhysicalProject
---------------------------PhysicalOlapScan[orders]
-------------------------PhysicalProject
---------------------------filter((sum(l_quantity) > 300.000000000))
-----------------------------hashAgg[GLOBAL]
-------------------------------hashAgg[LOCAL]
---------------------------------PhysicalProject
-----------------------------------PhysicalOlapScan[lineitem]
+----------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
+------------------PhysicalProject
+--------------------PhysicalOlapScan[customer]
+------------------PhysicalDistribute
+--------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[orders]
+----------------------PhysicalProject
+------------------------filter((sum(l_quantity) > 300.000000000))
+--------------------------hashAgg[LOCAL]
+----------------------------PhysicalProject
+------------------------------PhysicalOlapScan[lineitem]
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org