You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by fe...@apache.org on 2017/01/10 19:42:24 UTC

spark git commit: [SPARK-19133][SPARKR][ML] fix glm for Gamma, clarify glm family supported

Repository: spark
Updated Branches:
  refs/heads/master d5b1dc934 -> 9bc3507e4


[SPARK-19133][SPARKR][ML] fix glm for Gamma, clarify glm family supported

## What changes were proposed in this pull request?

R family is a longer list than what Spark supports.

## How was this patch tested?

manual

Author: Felix Cheung <fe...@hotmail.com>

Closes #16511 from felixcheung/rdocglmfamily.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9bc3507e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9bc3507e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9bc3507e

Branch: refs/heads/master
Commit: 9bc3507e411b0ad9207e3053f80ac82f19b18f26
Parents: d5b1dc9
Author: Felix Cheung <fe...@hotmail.com>
Authored: Tue Jan 10 11:42:07 2017 -0800
Committer: Felix Cheung <fe...@apache.org>
Committed: Tue Jan 10 11:42:07 2017 -0800

----------------------------------------------------------------------
 R/pkg/R/mllib_regression.R                      |  7 +++++-
 R/pkg/R/sparkR.R                                |  2 +-
 .../inst/tests/testthat/test_mllib_regression.R | 26 +++++++++++++-------
 3 files changed, 24 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/9bc3507e/R/pkg/R/mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index a480168..0e07d3b 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -52,6 +52,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #'               This can be a character string naming a family function, a family function or
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#'               Currently these families are supported: \code{binomial}, \code{gaussian},
+#'               \code{Gamma}, and \code{poisson}.
 #' @param tol positive convergence tolerance of iterations.
 #' @param maxIter integer giving the maximal number of IRLS iterations.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
@@ -104,8 +106,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
               weightCol <- ""
             }
 
+            # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
-                                "fit", formula, data@sdf, family$family, family$link,
+                                "fit", formula, data@sdf, tolower(family$family), family$link,
                                 tol, as.integer(maxIter), as.character(weightCol), regParam)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
@@ -120,6 +123,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #'               This can be a character string naming a family function, a family function or
 #'               the result of a call to a family function. Refer R family at
 #'               \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
+#'               Currently these families are supported: \code{binomial}, \code{gaussian},
+#'               \code{Gamma}, and \code{poisson}.
 #' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
 #'                  weights as 1.0.
 #' @param epsilon positive convergence tolerance of iterations.

http://git-wip-us.apache.org/repos/asf/spark/blob/9bc3507e/R/pkg/R/sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R
index e9d42c1..870e76b 100644
--- a/R/pkg/R/sparkR.R
+++ b/R/pkg/R/sparkR.R
@@ -423,7 +423,7 @@ sparkR.session <- function(
 #' sparkR.session()
 #' url <- sparkR.uiWebUrl()
 #' }
-#' @note sparkR.uiWebUrl since 2.2.0
+#' @note sparkR.uiWebUrl since 2.1.1
 sparkR.uiWebUrl <- function() {
   sc <- sparkR.callJMethod(getSparkContext(), "sc")
   u <- callJMethod(sc, "uiWebUrl")

http://git-wip-us.apache.org/repos/asf/spark/blob/9bc3507e/R/pkg/inst/tests/testthat/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index e20dafa..c450a15 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -61,14 +61,22 @@ test_that("spark.glm and predict", {
 
   # poisson family
   model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
-  family = poisson(link = identity))
+                     family = poisson(link = identity))
   prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-  data = iris, family = poisson(link = identity)), iris))
+                                        data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
+  # Gamma family
+  x <- runif(100, -1, 1)
+  y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+  df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+  model <- glm(y ~ x, family = Gamma, df)
+  out <- capture.output(print(summary(model)))
+  expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
   # Test stats::predict is working
   x <- rnorm(15)
   y <- x + rnorm(15)
@@ -103,11 +111,11 @@ test_that("spark.glm summary", {
   df <- suppressWarnings(createDataFrame(iris))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
-    family = binomial(link = "logit")))
+                             family = binomial(link = "logit")))
 
   rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
-  family = binomial(link = "logit")))
+                        family = binomial(link = "logit")))
 
   coefs <- unlist(stats$coefficients)
   rCoefs <- unlist(rStats$coefficients)
@@ -222,7 +230,7 @@ test_that("glm and predict", {
   training <- suppressWarnings(createDataFrame(iris))
   # gaussian family
   model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
-  prediction <- predict(model, training)
+               prediction <- predict(model, training)
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
@@ -235,7 +243,7 @@ test_that("glm and predict", {
   expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
   vals <- collect(select(prediction, "prediction"))
   rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
-           data = iris, family = poisson(link = identity)), iris))
+                                        data = iris, family = poisson(link = identity)), iris))
   expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
 
   # Test stats::predict is working
@@ -268,11 +276,11 @@ test_that("glm summary", {
   df <- suppressWarnings(createDataFrame(iris))
   training <- df[df$Species %in% c("versicolor", "virginica"), ]
   stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
-    family = binomial(link = "logit")))
+                       family = binomial(link = "logit")))
 
   rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
-    family = binomial(link = "logit")))
+                        family = binomial(link = "logit")))
 
   coefs <- unlist(stats$coefficients)
   rCoefs <- unlist(rStats$coefficients)
@@ -409,7 +417,7 @@ test_that("spark.survreg", {
                  x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
     expect_error(
       model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
-      NA)
+                                 NA)
     expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
   }
 })


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org