You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sh...@apache.org on 2016/01/27 04:29:50 UTC

spark git commit: [SPARK-12903][SPARKR] Add covar_samp and covar_pop for SparkR

Repository: spark
Updated Branches:
  refs/heads/master b72611f20 -> e7f9199e7


[SPARK-12903][SPARKR] Add covar_samp and covar_pop for SparkR

Add ```covar_samp``` and ```covar_pop``` for SparkR.
Should we also provide ```cov``` alias for ```covar_samp```? There is ```cov``` implementation at stats.R which masks ```stats::cov``` already, but may bring to breaking API change.

cc sun-rui felixcheung shivaram

Author: Yanbo Liang <yb...@gmail.com>

Closes #10829 from yanboliang/spark-12903.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7f9199e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7f9199e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7f9199e

Branch: refs/heads/master
Commit: e7f9199e709c46a6b5ad6b03c9ecf12cc19e3a41
Parents: b72611f
Author: Yanbo Liang <yb...@gmail.com>
Authored: Tue Jan 26 19:29:47 2016 -0800
Committer: Shivaram Venkataraman <sh...@cs.berkeley.edu>
Committed: Tue Jan 26 19:29:47 2016 -0800

----------------------------------------------------------------------
 R/pkg/NAMESPACE                           |  2 +
 R/pkg/R/functions.R                       | 58 ++++++++++++++++++++++++++
 R/pkg/R/generics.R                        | 10 ++++-
 R/pkg/R/stats.R                           |  3 +-
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  2 +
 5 files changed, 73 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9199e/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 2cc1544..f194a46 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -35,6 +35,8 @@ exportMethods("arrange",
               "count",
               "cov",
               "corr",
+              "covar_samp",
+              "covar_pop",
               "crosstab",
               "describe",
               "dim",

http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9199e/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9bb7876..8f8651c 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -275,6 +275,64 @@ setMethod("corr", signature(x = "Column"),
             column(jc)
           })
 
+#' cov
+#'
+#' Compute the sample covariance between two expressions.
+#'
+#' @rdname cov
+#' @name cov
+#' @family math_funcs
+#' @export
+#' @examples
+#' \dontrun{
+#' cov(df$c, df$d)
+#' cov("c", "d")
+#' covar_samp(df$c, df$d)
+#' covar_samp("c", "d")
+#' }
+setMethod("cov", signature(x = "characterOrColumn"),
+          function(x, col2) {
+            stopifnot(is(class(col2), "characterOrColumn"))
+            covar_samp(x, col2)
+          })
+
+#' @rdname cov
+#' @name covar_samp
+setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"),
+          function(col1, col2) {
+            stopifnot(class(col1) == class(col2))
+            if (class(col1) == "Column") {
+              col1 <- col1@jc
+              col2 <- col2@jc
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "covar_samp", col1, col2)
+            column(jc)
+          })
+
+#' covar_pop
+#'
+#' Compute the population covariance between two expressions.
+#'
+#' @rdname covar_pop
+#' @name covar_pop
+#' @family math_funcs
+#' @export
+#' @examples
+#' \dontrun{
+#' covar_pop(df$c, df$d)
+#' covar_pop("c", "d")
+#' }
+setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"),
+          function(col1, col2) {
+            stopifnot(class(col1) == class(col2))
+            if (class(col1) == "Column") {
+              col1 <- col1@jc
+              col2 <- col2@jc
+            }
+            jc <- callJStatic("org.apache.spark.sql.functions", "covar_pop", col1, col2)
+            column(jc)
+          })
+
 #' cos
 #'
 #' Computes the cosine of the given value.

http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9199e/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 04784d5..2dba71a 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -418,12 +418,20 @@ setGeneric("columns", function(x) {standardGeneric("columns") })
 
 #' @rdname statfunctions
 #' @export
-setGeneric("cov", function(x, col1, col2) {standardGeneric("cov") })
+setGeneric("cov", function(x, ...) {standardGeneric("cov") })
 
 #' @rdname statfunctions
 #' @export
 setGeneric("corr", function(x, ...) {standardGeneric("corr") })
 
+#' @rdname statfunctions
+#' @export
+setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") })
+
+#' @rdname statfunctions
+#' @export
+setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") })
+
 #' @rdname summary
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9199e/R/pkg/R/stats.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index d17cce9..2e80768 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -66,8 +66,9 @@ setMethod("crosstab",
 #' cov <- cov(df, "title", "gender")
 #' }
 setMethod("cov",
-          signature(x = "DataFrame", col1 = "character", col2 = "character"),
+          signature(x = "DataFrame"),
           function(x, col1, col2) {
+            stopifnot(class(col1) == "character" && class(col2) == "character")
             statFunctions <- callJMethod(x@sdf, "stat")
             callJMethod(statFunctions, "cov", col1, col2)
           })

http://git-wip-us.apache.org/repos/asf/spark/blob/e7f9199e/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index b52a11f..7b57137 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -996,6 +996,8 @@ test_that("column functions", {
   c14 <- cume_dist() + ntile(1) + corr(c, c1)
   c15 <- dense_rank() + percent_rank() + rank() + row_number()
   c16 <- is.nan(c) + isnan(c) + isNaN(c)
+  c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1")
+  c18 <- covar_pop(c, c1) + covar_pop("c", "c1")
 
   # Test if base::is.nan() is exposed
   expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org