You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by en...@apache.org on 2023/04/04 06:20:41 UTC

[doris] branch master updated: [improve](nereids)compute statsRange.length() according to the column datatype (#18331)

This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3fc8c19735 [improve](nereids)compute statsRange.length() according to the column datatype (#18331)
3fc8c19735 is described below

commit 3fc8c19735e55bee1e475124d42b155a1dea1d8e
Author: minghong <en...@gmail.com>
AuthorDate: Tue Apr 4 14:20:34 2023 +0800

    [improve](nereids)compute statsRange.length() according to the column datatype (#18331)
    
    we map date/datetime/V2 to double. this map reserves date order, but it does not reserve range length.
    For example, from 1990-01-01 to 1991-01-01, there are 12 months. for filter `A < 1990-02-01`, the selectivity
    should be `1/12`.
    
    if we compute this filter by their corresponding double value,
    `sel = (19900201 - 19900101) / (19910101 - 19900101) = 100/10000 = 1/100`
    
    the error is about 10 times.
    This pr aims to fix this error.
    Describe your changes.
    
    Solution:
    convert double to its corresponding dataType(date/datev2), then compute the range length with respect to its datatype.
---
 .../doris/nereids/stats/FilterEstimation.java      | 19 +++++++------
 .../doris/nereids/stats/StatsCalculator.java       |  9 +++---
 .../trees/expressions/literal/DateLiteral.java     |  5 ++++
 .../trees/expressions/literal/DateTimeLiteral.java |  5 ++++
 .../org/apache/doris/nereids/types/DataType.java   |  4 +++
 .../doris/nereids/types/coercion/DateLikeType.java | 22 +++++++++++++++
 .../apache/doris/statistics/StatisticRange.java    | 33 +++++++++++++---------
 .../doris/nereids/stats/FilterEstimationTest.java  | 24 ++++++++++++++++
 8 files changed, 94 insertions(+), 27 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 7cebe8f81d..0fd9f1e9b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -176,7 +176,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
             return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains);
         }
         //rightRange.distinctValues should not be used
-        StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv);
+        StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv,
+                leftExpr.getDataType());
         return estimateBinaryComparisonFilter(leftExpr,
                 statsForLeft,
                 rightRange, context);
@@ -189,7 +190,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
         }
         //rightRange.distinctValues should not be used
         StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue,
-                statsForLeft.ndv);
+                statsForLeft.ndv, leftExpr.getDataType());
         return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context);
     }
 
@@ -360,7 +361,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
     private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats,
             StatisticRange rightRange, EstimationContext context) {
         StatisticRange leftRange =
-                new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv);
+                new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType());
         StatisticRange intersectRange = leftRange.cover(rightRange);
         ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
                 .setMinValue(intersectRange.getLow())
@@ -375,8 +376,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
 
     private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatistic leftStats,
             Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
-        StatisticRange leftRange = StatisticRange.from(leftStats);
-        StatisticRange rightRange = StatisticRange.from(rightStats);
+        StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
+        StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
         StatisticRange leftIntersectRight = leftRange.intersect(rightRange);
         StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight);
         ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats);
@@ -396,8 +397,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
 
     private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStatistic leftStats,
             Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
-        StatisticRange leftRange = StatisticRange.from(leftStats);
-        StatisticRange rightRange = StatisticRange.from(rightStats);
+        StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
+        StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
         Statistics statistics = null;
         // Left always less than Right
         if (leftRange.getHigh() < rightRange.getLow()) {
@@ -414,7 +415,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
             return context.statistics.withRowCount(0.0);
         }
         StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue,
-                rightStats.minValue, Double.NaN);
+                rightStats.minValue, Double.NaN, leftExpr.getDataType());
         double leftAlwaysLessThanRightPercent = 0;
         if (leftRange.getLow() < rightRange.getLow()) {
             leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange);
@@ -429,7 +430,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
         double rightAlwaysGreaterRangeFraction = 0;
         if (leftRange.getHigh() < rightRange.getHigh()) {
             rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(),
-                    rightRange.getHigh(), Double.NaN));
+                    rightRange.getHigh(), Double.NaN, rightExpr.getDataType()));
         }
         ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats)
                 .setMinValue(Math.max(leftRange.getLow(), rightRange.getLow()))
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 6bae008b24..942c19a6ac 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -89,6 +89,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalTopN;
 import org.apache.doris.nereids.trees.plans.physical.PhysicalUnion;
 import org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
 import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
+import org.apache.doris.nereids.types.DataType;
 import org.apache.doris.statistics.ColumnLevelStatisticCache;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
@@ -577,7 +578,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
                 double rightRowCount = childStats.get(j).getRowCount();
                 ColumnStatistic estimatedColumnStatistics
                         = unionColumn(headStats.findColumnStatistics(headSlot),
-                        headStats.getRowCount(), rightStatistic, rightRowCount);
+                        headStats.getRowCount(), rightStatistic, rightRowCount, headSlot.getDataType());
                 headStats.addColumnStats(headSlot, estimatedColumnStatistics);
                 leftRowCount += childStats.get(j).getRowCount();
             }
@@ -692,12 +693,12 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
     }
 
     private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats,
-            double rightRowCount) {
+            double rightRowCount, DataType dataType) {
         ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
         columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue));
         columnStatisticBuilder.setMinValue(Math.min(leftStats.minValue, rightStats.minValue));
-        StatisticRange leftRange = StatisticRange.from(leftStats);
-        StatisticRange rightRange = StatisticRange.from(rightStats);
+        StatisticRange leftRange = StatisticRange.from(leftStats, dataType);
+        StatisticRange rightRange = StatisticRange.from(rightStats, dataType);
         StatisticRange newRange = leftRange.union(rightRange);
         double newRowCount = leftRowCount + rightRowCount;
         double leftSize = (leftRowCount - leftStats.numNulls) * leftStats.avgSizeByte;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
index f401d73b0c..da3dd4dfc3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
@@ -148,6 +148,11 @@ public class DateLiteral extends Literal {
         return (year * 10000 + month * 100 + day) * 1000000L;
     }
 
+    @Override
+    public double getDouble() {
+        return (double) getValue();
+    }
+
     @Override
     public String getStringValue() {
         return String.format("%04d-%02d-%02d", year, month, day);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
index a0a9abbd81..c97e1612e3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
@@ -247,6 +247,11 @@ public class DateTimeLiteral extends DateLiteral {
         return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second;
     }
 
+    @Override
+    public double getDouble() {
+        return (double) getValue();
+    }
+
     @Override
     public String toSql() {
         return toString();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
index f14c9e0e43..5e6706e9b0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
@@ -587,4 +587,8 @@ public abstract class DataType implements AbstractDataType {
             return (Map) ImmutableMap.copyOf(promotionMap);
         }
     }
+
+    public double rangeLength(double high, double low) {
+        return high - low;
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
index ee491fa18d..7defd1b61f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
@@ -17,8 +17,30 @@
 
 package org.apache.doris.nereids.types.coercion;
 
+import java.time.temporal.ChronoUnit;
+import java.util.Calendar;
+
 /**
  * date like type.
  */
 public abstract class DateLikeType extends PrimitiveType {
+    private Calendar toCalendar(double d) {
+        //d = (year * 10000 + month * 100 + day) * 1000000L;
+        int date = (int) (d / 1000000);
+        int day = date % 100;
+        int month = (date / 100) % 100;
+        int year = date / 10000;
+        Calendar calendar = Calendar.getInstance();
+        calendar.set(Calendar.YEAR, year);
+        calendar.set(Calendar.MONDAY, month);
+        calendar.set(Calendar.DAY_OF_MONTH, day);
+        return calendar;
+    }
+
+    @Override
+    public double rangeLength(double high, double low) {
+        Calendar to = toCalendar(high);
+        Calendar from = toCalendar(low);
+        return ChronoUnit.DAYS.between(from.toInstant(), to.toInstant());
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
index bbc95169e9..74055e62f1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
@@ -17,6 +17,8 @@
 
 package org.apache.doris.statistics;
 
+import org.apache.doris.nereids.types.DataType;
+
 import java.util.Objects;
 
 public class StatisticRange {
@@ -34,10 +36,13 @@ public class StatisticRange {
 
     private final double distinctValues;
 
-    public StatisticRange(double low, double high, double distinctValues) {
+    private final DataType dataType;
+
+    public StatisticRange(double low, double high, double distinctValues, DataType dataType) {
         this.low = low;
         this.high = high;
         this.distinctValues = distinctValues;
+        this.dataType = dataType;
     }
 
     public double overlapPercentWith(StatisticRange other) {
@@ -50,7 +55,7 @@ public class StatisticRange {
             return 1.0;
         }
 
-        double lengthOfIntersect = Math.min(this.high, other.high) - Math.max(this.low, other.low);
+        double lengthOfIntersect = dataType.rangeLength(Math.min(this.high, other.high), Math.max(this.low, other.low));
         if (Double.isInfinite(lengthOfIntersect)) {
             if (Double.isFinite(this.distinctValues) && Double.isFinite(other.distinctValues)) {
                 return Math.min(other.distinctValues / this.distinctValues, 1);
@@ -73,8 +78,8 @@ public class StatisticRange {
         return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
     }
 
-    public static StatisticRange empty() {
-        return new StatisticRange(Double.NaN, Double.NaN, 0);
+    public static StatisticRange empty(DataType dataType) {
+        return new StatisticRange(Double.NaN, Double.NaN, 0, dataType);
     }
 
     public boolean isEmpty() {
@@ -85,8 +90,8 @@ public class StatisticRange {
         return Double.isInfinite(low) && Double.isInfinite(high);
     }
 
-    public static StatisticRange from(ColumnStatistic column) {
-        return new StatisticRange(column.minValue, column.maxValue, column.ndv);
+    public static StatisticRange from(ColumnStatistic column, DataType dataType) {
+        return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType);
     }
 
     public double getLow() {
@@ -98,16 +103,16 @@ public class StatisticRange {
     }
 
     public double length() {
-        return this.high - this.low;
+        return dataType.rangeLength(this.high, this.low);
     }
 
     public StatisticRange intersect(StatisticRange other) {
         double newLow = Math.max(low, other.low);
         double newHigh = Math.min(high, other.high);
         if (newLow <= newHigh) {
-            return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other));
+            return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType);
         }
-        return empty();
+        return empty(dataType);
     }
 
     public StatisticRange cover(StatisticRange other) {
@@ -117,9 +122,9 @@ public class StatisticRange {
             double overlapPercentOfLeft = overlapPercentWith(other);
             double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
             double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft);
-            return new StatisticRange(newLow, newHigh, coveredDistinctValues);
+            return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType);
         }
-        return empty();
+        return empty(dataType);
     }
 
     public StatisticRange union(StatisticRange other) {
@@ -130,7 +135,7 @@ public class StatisticRange {
         double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther);
         double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues)
                 + ((1 - overlapPercentOther) * other.distinctValues);
-        return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV);
+        return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType);
     }
 
     private double overlappingDistinctValues(StatisticRange other) {
@@ -168,7 +173,7 @@ public class StatisticRange {
         return distinctValues;
     }
 
-    public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic) {
-        return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv);
+    public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) {
+        return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv, dataType);
     }
 }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index dbe1d9a1a1..9452eb6ff8 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -29,8 +29,10 @@ import org.apache.doris.nereids.trees.expressions.LessThanEqual;
 import org.apache.doris.nereids.trees.expressions.Not;
 import org.apache.doris.nereids.trees.expressions.Or;
 import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
 import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
 import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
+import org.apache.doris.nereids.types.DateType;
 import org.apache.doris.nereids.types.DoubleType;
 import org.apache.doris.nereids.types.IntegerType;
 import org.apache.doris.statistics.ColumnStatistic;
@@ -867,4 +869,26 @@ class FilterEstimationTest {
         Assertions.assertTrue(colStats != null);
         Assertions.assertEquals(10, colStats.ndv, 0.1);
     }
+
+    @Test
+    public void testDateRangeSelectivity() {
+        DateLiteral from = new DateLiteral("1990-01-01");
+        DateLiteral to = new DateLiteral("2000-01-01");
+        SlotReference a = new SlotReference("a", DateType.INSTANCE);
+        ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(4)
+                .setNumNulls(0)
+                .setMaxValue(to.getDouble())
+                .setMinValue(from.getDouble())
+                .setSelectivity(1.0)
+                .setCount(100);
+        DateLiteral mid = new DateLiteral("1999-01-01");
+        GreaterThan greaterThan = new GreaterThan(a, mid);
+        Statistics stats = new Statistics(100, new HashMap<>());
+        stats.addColumnStats(a, builder.build());
+        FilterEstimation filterEstimation = new FilterEstimation();
+        Statistics result = filterEstimation.estimate(greaterThan, stats);
+        Assertions.assertEquals(result.getRowCount(), 10, 0.1);
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org