You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/04/26 11:07:19 UTC
spark git commit: [SPARK-23799][SQL][FOLLOW-UP]
FilterEstimation.evaluateInSet produces wrong stats for STRING
Repository: spark
Updated Branches:
refs/heads/master d1eb8d3dd -> ce2f919f8
[SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING
## What changes were proposed in this pull request?
`colStat.min` and `colStat.max` are empty for string type. Thus, `evaluateInSet` should not return zero when either `colStat.min` or `colStat.max` is empty.
## How was this patch tested?
Added a test case.
Author: gatorsmile <ga...@gmail.com>
Closes #21147 from gatorsmile/cached.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce2f919f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce2f919f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce2f919f
Branch: refs/heads/master
Commit: ce2f919f8df1b794ceaa23e1a59d5d541ed47bf5
Parents: d1eb8d3
Author: gatorsmile <ga...@gmail.com>
Authored: Thu Apr 26 19:07:13 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu Apr 26 19:07:13 2018 +0800
----------------------------------------------------------------------
.../logical/statsEstimation/FilterEstimation.scala | 12 ++++++++----
.../statsEstimation/FilterEstimationSuite.scala | 12 ++++++++++++
2 files changed, 20 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index 263c9ba..5a3eeef 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging {
val dataType = attr.dataType
var newNdv = ndv
- if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
- return Some(0.0)
- }
-
// use [min, max] to filter the original hSet
dataType match {
case _: NumericType | BooleanType | DateType | TimestampType =>
+ if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
+ return Some(0.0)
+ }
+
val statsInterval =
ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
val validQuerySet = hSet.filter { v =>
@@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
// We assume the whole set since there is no min/max information for String/Binary type
case StringType | BinaryType =>
+ if (ndv.toDouble == 0) {
+ return Some(0.0)
+ }
+
newNdv = ndv.min(BigInt(hSet.size))
if (update) {
val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))
http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
index 16cb5d0..47bfa62 100755
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -368,6 +368,18 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 0)
}
+ test("evaluateInSet with string") {
+ validateEstimatedStats(
+ Filter(InSet(attrString, Set("A0")),
+ StatsTestPlan(Seq(attrString), 10,
+ AttributeMap(Seq(attrString ->
+ ColumnStat(distinctCount = Some(10), min = None, max = None,
+ nullCount = Some(0), avgLen = Some(2), maxLen = Some(2)))))),
+ Seq(attrString -> ColumnStat(distinctCount = Some(1), min = None, max = None,
+ nullCount = Some(0), avgLen = Some(2), maxLen = Some(2))),
+ expectedRowCount = 1)
+ }
+
test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org