You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2018/04/26 11:07:19 UTC
spark git commit: [SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING

Repository: spark
Updated Branches:
  refs/heads/master d1eb8d3dd -> ce2f919f8


[SPARK-23799][SQL][FOLLOW-UP] FilterEstimation.evaluateInSet produces wrong stats for STRING

## What changes were proposed in this pull request?
`colStat.min` AND `colStat.max` are empty for string type. Thus, `evaluateInSet` should not return zero when either `colStat.min` or `colStat.max`.

## How was this patch tested?
Added a test case.

Author: gatorsmile <ga...@gmail.com>

Closes #21147 from gatorsmile/cached.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce2f919f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce2f919f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce2f919f

Branch: refs/heads/master
Commit: ce2f919f8df1b794ceaa23e1a59d5d541ed47bf5
Parents: d1eb8d3
Author: gatorsmile <ga...@gmail.com>
Authored: Thu Apr 26 19:07:13 2018 +0800
Committer: Wenchen Fan <we...@databricks.com>
Committed: Thu Apr 26 19:07:13 2018 +0800

----------------------------------------------------------------------
 .../logical/statsEstimation/FilterEstimation.scala      | 12 ++++++++----
 .../statsEstimation/FilterEstimationSuite.scala         | 12 ++++++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
index 263c9ba..5a3eeef 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala
@@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging {
     val dataType = attr.dataType
     var newNdv = ndv
 
-    if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty)  {
-      return Some(0.0)
-    }
-
     // use [min, max] to filter the original hSet
     dataType match {
       case _: NumericType | BooleanType | DateType | TimestampType =>
+        if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty)  {
+          return Some(0.0)
+        }
+
         val statsInterval =
           ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
         val validQuerySet = hSet.filter { v =>
@@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {
 
       // We assume the whole set since there is no min/max information for String/Binary type
       case StringType | BinaryType =>
+        if (ndv.toDouble == 0)  {
+          return Some(0.0)
+        }
+
         newNdv = ndv.min(BigInt(hSet.size))
         if (update) {
           val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))

http://git-wip-us.apache.org/repos/asf/spark/blob/ce2f919f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
index 16cb5d0..47bfa62 100755
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala
@@ -368,6 +368,18 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
       expectedRowCount = 0)
   }
 
+  test("evaluateInSet with string") {
+    validateEstimatedStats(
+      Filter(InSet(attrString, Set("A0")),
+        StatsTestPlan(Seq(attrString), 10,
+          AttributeMap(Seq(attrString ->
+            ColumnStat(distinctCount = Some(10), min = None, max = None,
+              nullCount = Some(0), avgLen = Some(2), maxLen = Some(2)))))),
+      Seq(attrString -> ColumnStat(distinctCount = Some(1), min = None, max = None,
+        nullCount = Some(0), avgLen = Some(2), maxLen = Some(2))),
+      expectedRowCount = 1)
+  }
+
   test("cint NOT IN (3, 4, 5)") {
     validateEstimatedStats(
       Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org