You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/12/19 16:40:04 UTC

[doris] branch master updated: [fix](nereids) stats calculator lost column statistics on limit node (#14759)

This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new a086f67255 [fix](nereids) stats calculator lost column statistics on limit node (#14759)
a086f67255 is described below

commit a086f67255f4d6027a40d66b9f62fbb37c31a008
Author: minghong <mi...@163.com>
AuthorDate: Tue Dec 20 00:39:57 2022 +0800

    [fix](nereids) stats calculator lost column statistics on limit node (#14759)
    
    `select avg(id) from (select id from t1 limit 1);`
    The above SQL hits an NPE because the stats derived for the limit node lose the column statistics.
---
 .../doris/nereids/stats/StatsCalculator.java       |  6 +--
 .../apache/doris/statistics/ColumnStatistic.java   | 29 +++++++----
 .../apache/doris/statistics/StatsDeriveResult.java | 20 +++-----
 .../doris/statistics/StatsDeriveResultTest.java    | 57 ++++++++++++++++++++++
 4 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 2e8cd7abff..31975280e1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -276,7 +276,7 @@ public class StatsCalculator extends DefaultPlanVisitor<StatsDeriveResult, Void>
 
     private StatsDeriveResult computeAssertNumRows(long desiredNumOfRows) {
         StatsDeriveResult statsDeriveResult = groupExpression.childStatistics(0);
-        statsDeriveResult.updateRowCountByLimit(1);
+        statsDeriveResult.updateByLimit(1);
         return statsDeriveResult;
     }
 
@@ -315,12 +315,12 @@ public class StatsCalculator extends DefaultPlanVisitor<StatsDeriveResult, Void>
 
     private StatsDeriveResult computeTopN(TopN topN) {
         StatsDeriveResult stats = groupExpression.childStatistics(0);
-        return stats.updateRowCountByLimit(topN.getLimit());
+        return stats.updateByLimit(topN.getLimit());
     }
 
     private StatsDeriveResult computeLimit(Limit limit) {
         StatsDeriveResult stats = groupExpression.childStatistics(0);
-        return stats.updateRowCountByLimit(limit.getLimit());
+        return stats.updateByLimit(limit.getLimit());
     }
 
     private StatsDeriveResult computeAggregate(Aggregate aggregate) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index 4865767496..a0530a7f59 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -90,7 +90,7 @@ public class ColumnStatistic {
     public ColumnStatistic(double count, double ndv, double avgSizeByte,
             double numNulls, double dataSize, double minValue, double maxValue,
             double selectivity, LiteralExpr minExpr,
-            LiteralExpr maxExpr, boolean isNaN) {
+            LiteralExpr maxExpr, boolean isUnKnown) {
         this.count = count;
         this.ndv = ndv;
         this.avgSizeByte = avgSizeByte;
@@ -101,7 +101,7 @@ public class ColumnStatistic {
         this.selectivity = selectivity;
         this.minExpr = minExpr;
         this.maxExpr = maxExpr;
-        this.isUnKnown = isNaN;
+        this.isUnKnown = isUnKnown;
     }
 
     // TODO: use thrift
@@ -158,18 +158,29 @@ public class ColumnStatistic {
                 .setSelectivity(selectivity).setIsUnknown(isUnKnown).build();
     }
 
-    public ColumnStatistic multiply(double d) {
+    public ColumnStatistic updateByLimit(long limit, double rowCount) {
+        double ratio = 0;
+        if (rowCount != 0) {
+            ratio = limit / rowCount;
+        }
+        double newNdv = Math.ceil(Math.min(ndv, limit));
+        double newSelectivity = selectivity;
+        if (newNdv != 0) {
+            newSelectivity = newSelectivity * newNdv / ndv;
+        } else {
+            newSelectivity = 0;
+        }
         return new ColumnStatisticBuilder()
-                .setCount(Math.ceil(count * d))
-                .setNdv(Math.ceil(ndv * d))
-                .setAvgSizeByte(Math.ceil(avgSizeByte * d))
-                .setNumNulls(Math.ceil(numNulls * d))
-                .setDataSize(Math.ceil(dataSize * d))
+                .setCount(Math.ceil(limit))
+                .setNdv(newNdv)
+                .setAvgSizeByte(Math.ceil(avgSizeByte))
+                .setNumNulls(Math.ceil(numNulls * ratio))
+                .setDataSize(Math.ceil(dataSize * ratio))
                 .setMinValue(minValue)
                 .setMaxValue(maxValue)
                 .setMinExpr(minExpr)
                 .setMaxExpr(maxExpr)
-                .setSelectivity(selectivity)
+                .setSelectivity(newSelectivity)
                 .setIsUnknown(isUnKnown)
                 .build();
     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java
index 84c21c7b9d..f6a124b81d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java
@@ -20,6 +20,8 @@ package org.apache.doris.statistics;
 import org.apache.doris.common.Id;
 import org.apache.doris.nereids.trees.expressions.Slot;
 
+import com.google.common.base.Preconditions;
+
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -122,14 +124,12 @@ public class StatsDeriveResult {
         return statsDeriveResult;
     }
 
-    public StatsDeriveResult updateRowCountByLimit(long limit) {
+    public StatsDeriveResult updateByLimit(long limit) {
+        Preconditions.checkArgument(limit >= 0);
+        limit = Math.min(limit, (long) rowCount);
         StatsDeriveResult statsDeriveResult = new StatsDeriveResult(limit, width, penalty);
-        double selectivity = 1.0;
-        if (limit > 0 && rowCount > 0 && rowCount > limit) {
-            selectivity = ((double) limit) / rowCount;
-        }
         for (Entry<Id, ColumnStatistic> entry : slotIdToColumnStats.entrySet()) {
-            statsDeriveResult.addColumnStats(entry.getKey(), entry.getValue().multiply(selectivity));
+            statsDeriveResult.addColumnStats(entry.getKey(), entry.getValue().updateByLimit(limit, rowCount));
         }
         return statsDeriveResult;
     }
@@ -162,14 +162,6 @@ public class StatsDeriveResult {
         }
     }
 
-    public StatsDeriveResult updateRowCountOnCopy(double selectivity) {
-        StatsDeriveResult copy = new StatsDeriveResult(rowCount * selectivity, width, penalty);
-        for (Entry<Id, ColumnStatistic> entry : slotIdToColumnStats.entrySet()) {
-            copy.addColumnStats(entry.getKey(), entry.getValue().multiply(selectivity));
-        }
-        return copy;
-    }
-
     public StatsDeriveResult updateRowCount(double rowCount) {
         return new StatsDeriveResult(rowCount, width, penalty, slotIdToColumnStats);
     }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java
new file mode 100644
index 0000000000..0f2cebf511
--- /dev/null
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.statistics;
+
+import org.apache.doris.common.Id;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class StatsDeriveResultTest {
+    @Test
+    public void testUpdateRowCountByLimit() {
+        StatsDeriveResult stats = new StatsDeriveResult(100);
+        ColumnStatistic a = new ColumnStatistic(100, 10, 1, 5, 10,
+                1, 100, 0.5, null, null, false);
+        Id id = new Id(1);
+        stats.addColumnStats(id, a);
+        StatsDeriveResult res = stats.updateByLimit(0);
+        Assertions.assertEquals(0, res.getRowCount());
+        Assertions.assertEquals(1, res.getSlotIdToColumnStats().size());
+        ColumnStatistic resColStats = res.getColumnStatsBySlotId(id);
+        Assertions.assertEquals(0, resColStats.ndv);
+        Assertions.assertEquals(1, resColStats.avgSizeByte);
+        Assertions.assertEquals(0, resColStats.numNulls);
+        Assertions.assertEquals(0, resColStats.dataSize);
+        Assertions.assertEquals(1, resColStats.minValue);
+        Assertions.assertEquals(100, resColStats.maxValue);
+        Assertions.assertEquals(0, resColStats.selectivity);
+        Assertions.assertEquals(false, resColStats.isUnKnown);
+
+        res = stats.updateByLimit(1);
+        resColStats = res.getColumnStatsBySlotId(id);
+        Assertions.assertEquals(1, resColStats.ndv);
+        Assertions.assertEquals(1, resColStats.avgSizeByte);
+        Assertions.assertEquals(1, resColStats.numNulls);
+        Assertions.assertEquals(1, resColStats.dataSize);
+        Assertions.assertEquals(1, resColStats.minValue);
+        Assertions.assertEquals(100, resColStats.maxValue);
+        Assertions.assertEquals(0.05, resColStats.selectivity);
+        Assertions.assertEquals(false, resColStats.isUnKnown);
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org