You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by en...@apache.org on 2023/04/04 06:20:41 UTC
[doris] branch master updated: [improve](nereids)compute statsRange.length() according to the column datatype (#18331)
This is an automated email from the ASF dual-hosted git repository.
englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3fc8c19735 [improve](nereids)compute statsRange.length() according to the column datatype (#18331)
3fc8c19735 is described below
commit 3fc8c19735e55bee1e475124d42b155a1dea1d8e
Author: minghong <en...@gmail.com>
AuthorDate: Tue Apr 4 14:20:34 2023 +0800
[improve](nereids)compute statsRange.length() according to the column datatype (#18331)
we map date/datetime/V2 to double. this map reserves date order, but it does not reserve range length.
For example, from 1990-01-01 to 1991-01-01, there are 12 months. for filter `A < 1990-02-01`, the selectivity
should be `1/12`.
if we compute this filter by their corresponding double value,
`sel = (19900201 - 19900101) / (19910101 - 19900101) = 100/10000 = 1/100`
the error is about 10 times.
This pr aims to fix this error.
Describe your changes.
Solution:
convert double to its corresponding dataType(date/datev2), then compute the range length with respect to its datatype.
---
.../doris/nereids/stats/FilterEstimation.java | 19 +++++++------
.../doris/nereids/stats/StatsCalculator.java | 9 +++---
.../trees/expressions/literal/DateLiteral.java | 5 ++++
.../trees/expressions/literal/DateTimeLiteral.java | 5 ++++
.../org/apache/doris/nereids/types/DataType.java | 4 +++
.../doris/nereids/types/coercion/DateLikeType.java | 22 +++++++++++++++
.../apache/doris/statistics/StatisticRange.java | 33 +++++++++++++---------
.../doris/nereids/stats/FilterEstimationTest.java | 24 ++++++++++++++++
8 files changed, 94 insertions(+), 27 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index 7cebe8f81d..0fd9f1e9b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -176,7 +176,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains);
}
//rightRange.distinctValues should not be used
- StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv);
+ StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv,
+ leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr,
statsForLeft,
rightRange, context);
@@ -189,7 +190,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
}
//rightRange.distinctValues should not be used
StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue,
- statsForLeft.ndv);
+ statsForLeft.ndv, leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context);
}
@@ -360,7 +361,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats,
StatisticRange rightRange, EstimationContext context) {
StatisticRange leftRange =
- new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv);
+ new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType());
StatisticRange intersectRange = leftRange.cover(rightRange);
ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
.setMinValue(intersectRange.getLow())
@@ -375,8 +376,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatistic leftStats,
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
- StatisticRange leftRange = StatisticRange.from(leftStats);
- StatisticRange rightRange = StatisticRange.from(rightStats);
+ StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
+ StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
StatisticRange leftIntersectRight = leftRange.intersect(rightRange);
StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight);
ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats);
@@ -396,8 +397,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStatistic leftStats,
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
- StatisticRange leftRange = StatisticRange.from(leftStats);
- StatisticRange rightRange = StatisticRange.from(rightStats);
+ StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
+ StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
Statistics statistics = null;
// Left always less than Right
if (leftRange.getHigh() < rightRange.getLow()) {
@@ -414,7 +415,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
return context.statistics.withRowCount(0.0);
}
StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue,
- rightStats.minValue, Double.NaN);
+ rightStats.minValue, Double.NaN, leftExpr.getDataType());
double leftAlwaysLessThanRightPercent = 0;
if (leftRange.getLow() < rightRange.getLow()) {
leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange);
@@ -429,7 +430,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
double rightAlwaysGreaterRangeFraction = 0;
if (leftRange.getHigh() < rightRange.getHigh()) {
rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(),
- rightRange.getHigh(), Double.NaN));
+ rightRange.getHigh(), Double.NaN, rightExpr.getDataType()));
}
ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats)
.setMinValue(Math.max(leftRange.getLow(), rightRange.getLow()))
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 6bae008b24..942c19a6ac 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -89,6 +89,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalTopN;
import org.apache.doris.nereids.trees.plans.physical.PhysicalUnion;
import org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
+import org.apache.doris.nereids.types.DataType;
import org.apache.doris.statistics.ColumnLevelStatisticCache;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
@@ -577,7 +578,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
double rightRowCount = childStats.get(j).getRowCount();
ColumnStatistic estimatedColumnStatistics
= unionColumn(headStats.findColumnStatistics(headSlot),
- headStats.getRowCount(), rightStatistic, rightRowCount);
+ headStats.getRowCount(), rightStatistic, rightRowCount, headSlot.getDataType());
headStats.addColumnStats(headSlot, estimatedColumnStatistics);
leftRowCount += childStats.get(j).getRowCount();
}
@@ -692,12 +693,12 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
}
private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats,
- double rightRowCount) {
+ double rightRowCount, DataType dataType) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue));
columnStatisticBuilder.setMinValue(Math.min(leftStats.minValue, rightStats.minValue));
- StatisticRange leftRange = StatisticRange.from(leftStats);
- StatisticRange rightRange = StatisticRange.from(rightStats);
+ StatisticRange leftRange = StatisticRange.from(leftStats, dataType);
+ StatisticRange rightRange = StatisticRange.from(rightStats, dataType);
StatisticRange newRange = leftRange.union(rightRange);
double newRowCount = leftRowCount + rightRowCount;
double leftSize = (leftRowCount - leftStats.numNulls) * leftStats.avgSizeByte;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
index f401d73b0c..da3dd4dfc3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java
@@ -148,6 +148,11 @@ public class DateLiteral extends Literal {
return (year * 10000 + month * 100 + day) * 1000000L;
}
+ @Override
+ public double getDouble() {
+ return (double) getValue();
+ }
+
@Override
public String getStringValue() {
return String.format("%04d-%02d-%02d", year, month, day);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
index a0a9abbd81..c97e1612e3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java
@@ -247,6 +247,11 @@ public class DateTimeLiteral extends DateLiteral {
return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second;
}
+ @Override
+ public double getDouble() {
+ return (double) getValue();
+ }
+
@Override
public String toSql() {
return toString();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
index f14c9e0e43..5e6706e9b0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
@@ -587,4 +587,8 @@ public abstract class DataType implements AbstractDataType {
return (Map) ImmutableMap.copyOf(promotionMap);
}
}
+
+ public double rangeLength(double high, double low) {
+ return high - low;
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
index ee491fa18d..7defd1b61f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
@@ -17,8 +17,30 @@
package org.apache.doris.nereids.types.coercion;
+import java.time.temporal.ChronoUnit;
+import java.util.Calendar;
+
/**
* date like type.
*/
public abstract class DateLikeType extends PrimitiveType {
+ private Calendar toCalendar(double d) {
+ //d = (year * 10000 + month * 100 + day) * 1000000L;
+ int date = (int) (d / 1000000);
+ int day = date % 100;
+ int month = (date / 100) % 100;
+ int year = date / 10000;
+ Calendar calendar = Calendar.getInstance();
+ calendar.set(Calendar.YEAR, year);
+ calendar.set(Calendar.MONDAY, month);
+ calendar.set(Calendar.DAY_OF_MONTH, day);
+ return calendar;
+ }
+
+ @Override
+ public double rangeLength(double high, double low) {
+ Calendar to = toCalendar(high);
+ Calendar from = toCalendar(low);
+ return ChronoUnit.DAYS.between(from.toInstant(), to.toInstant());
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
index bbc95169e9..74055e62f1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java
@@ -17,6 +17,8 @@
package org.apache.doris.statistics;
+import org.apache.doris.nereids.types.DataType;
+
import java.util.Objects;
public class StatisticRange {
@@ -34,10 +36,13 @@ public class StatisticRange {
private final double distinctValues;
- public StatisticRange(double low, double high, double distinctValues) {
+ private final DataType dataType;
+
+ public StatisticRange(double low, double high, double distinctValues, DataType dataType) {
this.low = low;
this.high = high;
this.distinctValues = distinctValues;
+ this.dataType = dataType;
}
public double overlapPercentWith(StatisticRange other) {
@@ -50,7 +55,7 @@ public class StatisticRange {
return 1.0;
}
- double lengthOfIntersect = Math.min(this.high, other.high) - Math.max(this.low, other.low);
+ double lengthOfIntersect = dataType.rangeLength(Math.min(this.high, other.high), Math.max(this.low, other.low));
if (Double.isInfinite(lengthOfIntersect)) {
if (Double.isFinite(this.distinctValues) && Double.isFinite(other.distinctValues)) {
return Math.min(other.distinctValues / this.distinctValues, 1);
@@ -73,8 +78,8 @@ public class StatisticRange {
return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
}
- public static StatisticRange empty() {
- return new StatisticRange(Double.NaN, Double.NaN, 0);
+ public static StatisticRange empty(DataType dataType) {
+ return new StatisticRange(Double.NaN, Double.NaN, 0, dataType);
}
public boolean isEmpty() {
@@ -85,8 +90,8 @@ public class StatisticRange {
return Double.isInfinite(low) && Double.isInfinite(high);
}
- public static StatisticRange from(ColumnStatistic column) {
- return new StatisticRange(column.minValue, column.maxValue, column.ndv);
+ public static StatisticRange from(ColumnStatistic column, DataType dataType) {
+ return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType);
}
public double getLow() {
@@ -98,16 +103,16 @@ public class StatisticRange {
}
public double length() {
- return this.high - this.low;
+ return dataType.rangeLength(this.high, this.low);
}
public StatisticRange intersect(StatisticRange other) {
double newLow = Math.max(low, other.low);
double newHigh = Math.min(high, other.high);
if (newLow <= newHigh) {
- return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other));
+ return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType);
}
- return empty();
+ return empty(dataType);
}
public StatisticRange cover(StatisticRange other) {
@@ -117,9 +122,9 @@ public class StatisticRange {
double overlapPercentOfLeft = overlapPercentWith(other);
double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft);
- return new StatisticRange(newLow, newHigh, coveredDistinctValues);
+ return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType);
}
- return empty();
+ return empty(dataType);
}
public StatisticRange union(StatisticRange other) {
@@ -130,7 +135,7 @@ public class StatisticRange {
double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther);
double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues)
+ ((1 - overlapPercentOther) * other.distinctValues);
- return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV);
+ return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType);
}
private double overlappingDistinctValues(StatisticRange other) {
@@ -168,7 +173,7 @@ public class StatisticRange {
return distinctValues;
}
- public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic) {
- return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv);
+ public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) {
+ return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv, dataType);
}
}
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index dbe1d9a1a1..9452eb6ff8 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -29,8 +29,10 @@ import org.apache.doris.nereids.trees.expressions.LessThanEqual;
import org.apache.doris.nereids.trees.expressions.Not;
import org.apache.doris.nereids.trees.expressions.Or;
import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
+import org.apache.doris.nereids.types.DateType;
import org.apache.doris.nereids.types.DoubleType;
import org.apache.doris.nereids.types.IntegerType;
import org.apache.doris.statistics.ColumnStatistic;
@@ -867,4 +869,26 @@ class FilterEstimationTest {
Assertions.assertTrue(colStats != null);
Assertions.assertEquals(10, colStats.ndv, 0.1);
}
+
+ @Test
+ public void testDateRangeSelectivity() {
+ DateLiteral from = new DateLiteral("1990-01-01");
+ DateLiteral to = new DateLiteral("2000-01-01");
+ SlotReference a = new SlotReference("a", DateType.INSTANCE);
+ ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
+ .setNdv(100)
+ .setAvgSizeByte(4)
+ .setNumNulls(0)
+ .setMaxValue(to.getDouble())
+ .setMinValue(from.getDouble())
+ .setSelectivity(1.0)
+ .setCount(100);
+ DateLiteral mid = new DateLiteral("1999-01-01");
+ GreaterThan greaterThan = new GreaterThan(a, mid);
+ Statistics stats = new Statistics(100, new HashMap<>());
+ stats.addColumnStats(a, builder.build());
+ FilterEstimation filterEstimation = new FilterEstimation();
+ Statistics result = filterEstimation.estimate(greaterThan, stats);
+ Assertions.assertEquals(result.getRowCount(), 10, 0.1);
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org