You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2016/11/01 13:11:29 UTC
spark git commit: [SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple records have the minimum value
Repository: spark
Updated Branches:
refs/heads/master 623fc7fc6 -> cb80edc26
[SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple records have the minimum value
## What changes were proposed in this pull request?
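When multiple records have the minimum value, ApproximatePercentile returns a wrong answer: the code in QuantileSummaries that adds the minimum element only did so when its value was strictly less than the current head's value (`<`), so a minimum sample with an equal value was not added. The check now uses `<=` and skips the case where `currentSamples` has only one element.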
## How was this patch tested?
Added a test case to ApproximatePercentileQuerySuite.
Author: wangzhenhua <wa...@huawei.com>
Closes #15641 from wzhfy/percentile.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb80edc2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb80edc2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb80edc2
Branch: refs/heads/master
Commit: cb80edc26349e2e358d27fe2ae8e5d6959b77fab
Parents: 623fc7f
Author: wangzhenhua <wa...@huawei.com>
Authored: Tue Nov 1 13:11:24 2016 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Nov 1 13:11:24 2016 +0000
----------------------------------------------------------------------
.../spark/sql/catalyst/util/QuantileSummaries.scala | 4 +++-
.../spark/sql/ApproximatePercentileQuerySuite.scala | 11 +++++++++++
2 files changed, 14 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/cb80edc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
index 27928c4..04f4ff2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
@@ -264,7 +264,9 @@ object QuantileSummaries {
res.prepend(head)
// If necessary, add the minimum element:
val currHead = currentSamples.head
- if (currHead.value < head.value) {
+ // don't add the minimum element if `currentSamples` has only one element (both `currHead` and
+ // `head` point to the same element)
+ if (currHead.value <= head.value && currentSamples.length > 1) {
res.prepend(currentSamples.head)
}
res.toArray
http://git-wip-us.apache.org/repos/asf/spark/blob/cb80edc2/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
index 37d7c44..e98092d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
@@ -64,6 +64,17 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext {
}
}
+ test("percentile_approx, multiple records with the minimum value in a partition") {
+ withTempView(table) {
+ spark.sparkContext.makeRDD(Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5), 4).toDF("col")
+ .createOrReplaceTempView(table)
+ checkAnswer(
+ spark.sql(s"SELECT percentile_approx(col, array(0.5)) FROM $table"),
+ Row(Seq(1.0D))
+ )
+ }
+ }
+
test("percentile_approx, with different accuracies") {
withTempView(table) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org