You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sh...@apache.org on 2015/12/12 05:55:19 UTC
spark git commit: [SPARK-12158][SPARKR][SQL] Fix 'sample' functions
that break R unit test cases
Repository: spark
Updated Branches:
refs/heads/master 1e799d617 -> 1e3526c2d
[SPARK-12158][SPARKR][SQL] Fix 'sample' functions that break R unit test cases
The existing sample functions are missing the parameter `seed`; however, the corresponding function interface in `generics` has such a parameter. Thus, although the caller can pass `seed` to the function, the value is silently ignored.
This could cause SparkR unit tests to fail. For example, I hit it in another PR:
https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/47213/consoleFull
Author: gatorsmile <ga...@gmail.com>
Closes #10160 from gatorsmile/sampleR.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e3526c2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e3526c2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e3526c2
Branch: refs/heads/master
Commit: 1e3526c2d3de723225024fedd45753b556e18fc6
Parents: 1e799d6
Author: gatorsmile <ga...@gmail.com>
Authored: Fri Dec 11 20:55:16 2015 -0800
Committer: Shivaram Venkataraman <sh...@cs.berkeley.edu>
Committed: Fri Dec 11 20:55:16 2015 -0800
----------------------------------------------------------------------
R/pkg/R/DataFrame.R | 17 +++++++++++------
R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++++
2 files changed, 15 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/1e3526c2/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 975b058..764597d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -662,6 +662,7 @@ setMethod("unique",
#' @param x A SparkSQL DataFrame
#' @param withReplacement Sampling with replacement or not
#' @param fraction The (rough) sample target fraction
+#' @param seed Randomness seed value
#'
#' @family DataFrame functions
#' @rdname sample
@@ -677,13 +678,17 @@ setMethod("unique",
#' collect(sample(df, TRUE, 0.5))
#'}
setMethod("sample",
- # TODO : Figure out how to send integer as java.lang.Long to JVM so
- # we can send seed as an argument through callJMethod
signature(x = "DataFrame", withReplacement = "logical",
fraction = "numeric"),
- function(x, withReplacement, fraction) {
+ function(x, withReplacement, fraction, seed) {
if (fraction < 0.0) stop(cat("Negative fraction value:", fraction))
- sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction)
+ if (!missing(seed)) {
+ # TODO : Figure out how to send integer as java.lang.Long to JVM so
+ # we can send seed as an argument through callJMethod
+ sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed))
+ } else {
+ sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction)
+ }
dataFrame(sdf)
})
@@ -692,8 +697,8 @@ setMethod("sample",
setMethod("sample_frac",
signature(x = "DataFrame", withReplacement = "logical",
fraction = "numeric"),
- function(x, withReplacement, fraction) {
- sample(x, withReplacement, fraction)
+ function(x, withReplacement, fraction, seed) {
+ sample(x, withReplacement, fraction, seed)
})
#' nrow
http://git-wip-us.apache.org/repos/asf/spark/blob/1e3526c2/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index ed9b2c9..071fd31 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -724,6 +724,10 @@ test_that("sample on a DataFrame", {
sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
expect_true(count(sampled2) < 3)
+ count1 <- count(sample(df, FALSE, 0.1, 0))
+ count2 <- count(sample(df, FALSE, 0.1, 0))
+ expect_equal(count1, count2)
+
# Also test sample_frac
sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
expect_true(count(sampled3) < 3)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org