You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/11/02 18:49:34 UTC
spark git commit: [SPARK-18111][SQL] Wrong approximate quantile
answer when multiple records have the minimum value (for branch 2.0)
Repository: spark
Updated Branches:
refs/heads/branch-2.0 1696bcfad -> 3253ae7f7
[SPARK-18111][SQL] Wrong approximate quantile answer when multiple records have the minimum value (for branch 2.0)
## What changes were proposed in this pull request?
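When multiple records have the minimum value, the answer of `StatFunctions.multipleApproxQuantiles` is wrong. The minimum sample was re-added to the result only when its value was strictly less than the retained head's value (`currHead.value < head.value`), so it was dropped when the two values were equal. This patch changes the comparison to `<=` and skips the re-add when `currentSamples` has only one element, because `currHead` and `head` then refer to the same element.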
## How was this patch tested?
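Added a test case to `DataFrameStatSuite` ("approximate quantile, multiple records with the minimum value in a partition").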
Author: wangzhenhua <wa...@huawei.com>
Closes #15732 from wzhfy/percentile2.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3253ae7f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3253ae7f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3253ae7f
Branch: refs/heads/branch-2.0
Commit: 3253ae7f722a996cf0af21608e1a27d5d2a12004
Parents: 1696bcf
Author: wangzhenhua <wa...@huawei.com>
Authored: Wed Nov 2 11:49:30 2016 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Wed Nov 2 11:49:30 2016 -0700
----------------------------------------------------------------------
.../spark/sql/execution/stat/StatFunctions.scala | 4 +++-
.../org/apache/spark/sql/DataFrameStatSuite.scala | 13 +++++++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index 7e2ebe8..acc42a0 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -337,7 +337,9 @@ object StatFunctions extends Logging {
res.prepend(head)
// If necessary, add the minimum element:
val currHead = currentSamples.head
- if (currHead.value < head.value) {
+ // don't add the minimum element if `currentSamples` has only one element (both `currHead` and
+ // `head` point to the same element)
+ if (currHead.value <= head.value && currentSamples.length > 1) {
res.prepend(currentSamples.head)
}
res.toArray
http://git-wip-us.apache.org/repos/asf/spark/blob/3253ae7f/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index 73026c7..571e2ad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -152,6 +152,19 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
}
}
+ test("approximate quantile, multiple records with the minimum value in a partition") {
+ val data = Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5)
+ val df = spark.sparkContext.makeRDD(data, 4).toDF("col")
+ val epsilons = List(0.1, 0.05, 0.001)
+ val quantile = 0.5
+ val expected = 1
+ for (epsilon <- epsilons) {
+ val Array(answer) = df.stat.approxQuantile("col", Array(quantile), epsilon)
+ val error = 2 * data.length * epsilon
+ assert(math.abs(answer - expected) < error)
+ }
+ }
+
test("crosstab") {
val rng = new Random()
val data = Seq.tabulate(25)(i => (rng.nextInt(5), rng.nextInt(10)))
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org