You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by en...@apache.org on 2023/04/14 08:13:33 UTC
[doris] branch master updated: [opt](nereids) optimize aggregation estimation #18607
This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4174d5a707 [opt](nereids) optimize aggregation estimation #18607
4174d5a707 is described below
commit 4174d5a7077ca540ba0a70d7504d5f973819e2cf
Author: minghong <en...@gmail.com>
AuthorDate: Fri Apr 14 16:13:25 2023 +0800
[opt](nereids) optimize aggregation estimation #18607
`select count(*) from T group by A, B`
suppose `ndv(A) > ndv(B)`
the estimated row count of aggregate is between ndv(A) and ndv(A) * ndv(B)
In the previous version, we chose the upper bound, that is, ndv(A) * ndv(B). The drawback of this choice is that the estimated row count is often bigger than the row count of T.
In this version, we choose the lower bound.
---
.../rules/implementation/AggregateStrategies.java | 25 --------------
.../doris/nereids/stats/StatsCalculator.java | 7 ++--
.../data/nereids_tpchPlanShape_p0/shape/q10.out | 33 +++++++++----------
.../data/nereids_tpchPlanShape_p0/shape/q13.out | 17 +++++-----
.../data/nereids_tpchPlanShape_p0/shape/q18.out | 38 ++++++++++------------
5 files changed, 44 insertions(+), 76 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
index aa39ad3467..be914f938e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java
@@ -63,7 +63,6 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalStorageLayerAggrega
import org.apache.doris.nereids.util.ExpressionUtils;
import org.apache.doris.nereids.util.TypeCoercionUtils;
import org.apache.doris.qe.ConnectContext;
-import org.apache.doris.statistics.Statistics;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
@@ -303,20 +302,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
}
}
- private boolean aggregateOnUniqueColumn(
- LogicalAggregate<? extends Plan> logicalAgg) {
- if (logicalAgg.child() instanceof GroupPlan) {
- Statistics childStats = ((GroupPlan) logicalAgg.child()).getGroup().getStatistics();
- if (childStats != null) {
- return logicalAgg.getGroupByExpressions().stream().anyMatch(
- expression ->
- childStats.almostUniqueExpression(expression)
- );
- }
- }
- return false;
- }
-
/**
* sql: select count(*) from tbl group by id
*
@@ -345,11 +330,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
*/
private List<PhysicalHashAggregate<Plan>> onePhaseAggregateWithoutDistinct(
LogicalAggregate<? extends Plan> logicalAgg, ConnectContext connectContext) {
- if (!logicalAgg.getGroupByExpressions().isEmpty()
- && !aggregateOnUniqueColumn(logicalAgg)) {
- // twoPhaseAggregate beats onePhaseAggregate
- return null;
- }
RequireProperties requireGather = RequireProperties.of(PhysicalProperties.GATHER);
AggregateParam inputToResultParam = AggregateParam.localResult();
List<NamedExpression> newOutput = ExpressionUtils.rewriteDownShortCircuit(
@@ -776,11 +756,6 @@ public class AggregateStrategies implements ImplementationRuleFactory {
*/
private List<PhysicalHashAggregate<? extends Plan>> twoPhaseAggregateWithDistinct(
LogicalAggregate<? extends Plan> logicalAgg, ConnectContext connectContext) {
- if (!logicalAgg.getGroupByExpressions().isEmpty()
- && !aggregateOnUniqueColumn(logicalAgg)) {
- // threePhaseAggregate beats twoPhaseAggregate
- return null;
- }
Set<AggregateFunction> aggregateFunctions = logicalAgg.getAggregateFunctions();
Set<Expression> distinctArguments = aggregateFunctions.stream()
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 5be38b5dde..4c53ff0c1d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -479,11 +479,8 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
//all column stats are unknown, use default ratio
resultSetCount = inputRowCount * DEFAULT_AGGREGATE_RATIO;
} else {
- resultSetCount = groupByKeyStats.stream()
- .map(s -> s.ndv)
- .reduce(1.0, (a, b) -> a * b);
- //agg output tuples should be less than input tuples
- resultSetCount = Math.min(resultSetCount, inputRowCount);
+ resultSetCount = groupByKeyStats.stream().map(s -> s.ndv)
+ .max(Double::compare).get();
}
}
}
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
index 5cdb232001..b70a354e27 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out
@@ -4,24 +4,23 @@ PhysicalTopN
--PhysicalDistribute
----PhysicalTopN
------PhysicalProject
---------hashAgg[GLOBAL]
+--------hashAgg[LOCAL]
----------PhysicalDistribute
-------------hashAgg[LOCAL]
---------------PhysicalProject
-----------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
-------------------PhysicalProject
---------------------filter((lineitem.l_returnflag = 'R'))
-----------------------PhysicalOlapScan[lineitem]
-------------------PhysicalDistribute
---------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
-----------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
-------------------------PhysicalProject
---------------------------PhysicalOlapScan[customer]
-------------------------PhysicalDistribute
---------------------------PhysicalProject
-----------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
-------------------------------PhysicalOlapScan[orders]
+------------PhysicalProject
+--------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
+----------------PhysicalProject
+------------------filter((lineitem.l_returnflag = 'R'))
+--------------------PhysicalOlapScan[lineitem]
+----------------PhysicalDistribute
+------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
+--------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[customer]
----------------------PhysicalDistribute
------------------------PhysicalProject
---------------------------PhysicalOlapScan[nation]
+--------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
+----------------------------PhysicalOlapScan[orders]
+--------------------PhysicalDistribute
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[nation]
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
index 6df8a92b08..d196ba817e 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out
@@ -7,14 +7,13 @@ PhysicalQuickSort
--------PhysicalDistribute
----------hashAgg[LOCAL]
------------PhysicalProject
---------------hashAgg[GLOBAL]
-----------------hashAgg[LOCAL]
-------------------PhysicalProject
---------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey)
-----------------------PhysicalDistribute
-------------------------PhysicalProject
---------------------------filter(( not (o_comment like '%special%requests%')))
-----------------------------PhysicalOlapScan[orders]
+--------------hashAgg[LOCAL]
+----------------PhysicalProject
+------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey)
+--------------------PhysicalDistribute
----------------------PhysicalProject
-------------------------PhysicalOlapScan[customer]
+------------------------filter(( not (o_comment like '%special%requests%')))
+--------------------------PhysicalOlapScan[orders]
+--------------------PhysicalProject
+----------------------PhysicalOlapScan[customer]
diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
index 299b4bb581..3e05c3a7c0 100644
--- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
+++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out
@@ -3,25 +3,23 @@
PhysicalTopN
--PhysicalDistribute
----PhysicalTopN
-------hashAgg[GLOBAL]
---------hashAgg[LOCAL]
-----------PhysicalProject
-------------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+------hashAgg[LOCAL]
+--------PhysicalProject
+----------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+------------PhysicalProject
+--------------PhysicalOlapScan[lineitem]
+------------PhysicalDistribute
--------------PhysicalProject
-----------------PhysicalOlapScan[lineitem]
---------------PhysicalDistribute
-----------------PhysicalProject
-------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
---------------------PhysicalProject
-----------------------PhysicalOlapScan[customer]
---------------------PhysicalDistribute
-----------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey)
-------------------------PhysicalProject
---------------------------PhysicalOlapScan[orders]
-------------------------PhysicalProject
---------------------------filter((sum(l_quantity) > 300.000000000))
-----------------------------hashAgg[GLOBAL]
-------------------------------hashAgg[LOCAL]
---------------------------------PhysicalProject
-----------------------------------PhysicalOlapScan[lineitem]
+----------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
+------------------PhysicalProject
+--------------------PhysicalOlapScan[customer]
+------------------PhysicalDistribute
+--------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey)
+----------------------PhysicalProject
+------------------------PhysicalOlapScan[orders]
+----------------------PhysicalProject
+------------------------filter((sum(l_quantity) > 300.000000000))
+--------------------------hashAgg[LOCAL]
+----------------------------PhysicalProject
+------------------------------PhysicalOlapScan[lineitem]
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org