Posted to commits@spark.apache.org by fe...@apache.org on 2017/01/31 20:20:47 UTC

spark git commit: [SPARK-19395][SPARKR] Convert coefficients in summary to matrix

Repository: spark
Updated Branches:
  refs/heads/master 26a4cba3f -> ce112cec4


[SPARK-19395][SPARKR] Convert coefficients in summary to matrix

## What changes were proposed in this pull request?
The `coefficients` component in the model summary should be a `matrix`, but its underlying structure is actually a `list`. This affects several models; only `AFTSurvivalRegressionModel` has the correct implementation. The fix is to `unlist` the coefficients returned from `callJMethod` before converting them to a matrix. An example illustrates the issue:

```
data(iris)
df <- createDataFrame(iris)
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
s <- summary(model)

> str(s$coefficients)
List of 8
 $ : num 6.53
 $ : num -0.223
 $ : num 0.479
 $ : num 0.155
 $ : num 13.6
 $ : num -1.44
 $ : num 0
 $ : num 0.152
 - attr(*, "dim")= int [1:2] 2 4
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:2] "(Intercept)" "Sepal_Width"
  ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
> s$coefficients[, 2]
$`(Intercept)`
[1] 0.4788963

$Sepal_Width
[1] 0.1550809
```

This shows that the underlying structure of `coefficients` is still a `list`.
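
The fix wraps the returned list in `unlist()` before calling `matrix()`. A minimal base-R sketch of why this matters, reusing the values from the example above:

```
# coefficients as returned over the JVM bridge: a plain R list of numerics
coefs <- list(6.53, -0.223, 0.479, 0.155, 13.6, -1.44, 0, 0.152)

# matrix() keeps the list storage mode, so a column is still a list
badCoefs <- matrix(coefs, ncol = 4)
str(badCoefs[, 2])    # List of 2 -- not usable as a numeric column

# unlist() first, then matrix(), yields a proper numeric matrix
goodCoefs <- matrix(unlist(coefs), ncol = 4)
str(goodCoefs[, 2])   # num [1:2] 0.479 0.155
```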

felixcheung wangmiao1981

Author: actuaryzhang <ac...@gmail.com>

Closes #16730 from actuaryzhang/sparkRCoef.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce112cec
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce112cec
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce112cec

Branch: refs/heads/master
Commit: ce112cec4f9bff222aa256893f94c316662a2a7e
Parents: 26a4cba
Author: actuaryzhang <ac...@gmail.com>
Authored: Tue Jan 31 12:20:43 2017 -0800
Committer: Felix Cheung <fe...@apache.org>
Committed: Tue Jan 31 12:20:43 2017 -0800

----------------------------------------------------------------------
 R/pkg/R/mllib_classification.R                  |  2 +-
 R/pkg/R/mllib_clustering.R                      |  2 +-
 R/pkg/R/mllib_regression.R                      |  4 +--
 .../tests/testthat/test_mllib_classification.R  | 17 +++++++-----
 .../inst/tests/testthat/test_mllib_clustering.R |  4 +++
 .../inst/tests/testthat/test_mllib_regression.R | 27 +++++++++++---------
 6 files changed, 34 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index fee4a4c..552cbe4 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -145,7 +145,7 @@ setMethod("summary", signature(object = "LogisticRegressionModel"),
             labels <- callJMethod(jobj, "labels")
             coefficients <- callJMethod(jobj, "rCoefficients")
             nCol <- length(coefficients) / length(features)
-            coefficients <- matrix(coefficients, ncol = nCol)
+            coefficients <- matrix(unlist(coefficients), ncol = nCol)
             # If nCol == 1, means this is a binomial logistic regression model with pivoting.
             # Otherwise, it's a multinomial logistic regression model without pivoting.
             if (nCol == 1) {
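
A hedged usage sketch of the fixed `spark.logit` summary (column names follow the tests below; exact coefficient values depend on the data and `regParam`):

```
df <- suppressWarnings(createDataFrame(iris))

# multinomial: one coefficient column per class label
model <- spark.logit(df, Species ~ ., regParam = 0.5)
s <- summary(model)
class(s$coefficients)              # "matrix"
s$coefficients[, "versicolor"]     # numeric vector, no unlist() needed

# binomial (two classes): a single "Estimate" column
training <- filter(df, df$Species != "setosa")
coefs <- summary(spark.logit(training, Species ~ ., regParam = 0.5))$coefficients
coefs[, "Estimate"]                # numeric vector
```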

http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
index e384c73..3b782ce 100644
--- a/R/pkg/R/mllib_clustering.R
+++ b/R/pkg/R/mllib_clustering.R
@@ -390,7 +390,7 @@ setMethod("summary", signature(object = "KMeansModel"),
             coefficients <- callJMethod(jobj, "coefficients")
             k <- callJMethod(jobj, "k")
             size <- callJMethod(jobj, "size")
-            coefficients <- t(matrix(coefficients, ncol = k))
+            coefficients <- t(matrix(unlist(coefficients), ncol = k))
             colnames(coefficients) <- unlist(features)
             rownames(coefficients) <- 1:k
             cluster <- if (is.loaded) {
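
A short hedged sketch of the resulting `spark.kmeans` summary: `coefficients` is now a numeric matrix with one row per cluster (row names `1:k`) and one column per feature (exact centers depend on the data and initialization):

```
df <- suppressWarnings(createDataFrame(iris))
model <- spark.kmeans(df, ~ Sepal_Length + Sepal_Width, k = 2)
s <- summary(model)

class(s$coefficients)    # "matrix"
dim(s$coefficients)      # 2 x 2: one row per cluster, one column per feature
s$coefficients[1, ]      # numeric cluster center, directly usable
```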

http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/R/mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R
index 7908600..96ee220 100644
--- a/R/pkg/R/mllib_regression.R
+++ b/R/pkg/R/mllib_regression.R
@@ -182,11 +182,11 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             # coefficients, standard error of coefficients, t value and p value. Otherwise,
             # it will be fitted by local "l-bfgs", we can only provide coefficients.
             if (length(features) == length(coefficients)) {
-              coefficients <- matrix(coefficients, ncol = 1)
+              coefficients <- matrix(unlist(coefficients), ncol = 1)
               colnames(coefficients) <- c("Estimate")
               rownames(coefficients) <- unlist(features)
             } else {
-              coefficients <- matrix(coefficients, ncol = 4)
+              coefficients <- matrix(unlist(coefficients), ncol = 4)
               colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
               rownames(coefficients) <- unlist(features)
             }
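
With this change, the GLM example from the description returns proper numeric columns; a hedged sketch reusing that example:

```
df <- createDataFrame(iris)
model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
s <- summary(model)

class(s$coefficients)             # "matrix"
s$coefficients[, "Std. Error"]    # named numeric vector: (Intercept) ~0.479, Sepal_Width ~0.155
s$coefficients[, 2]               # same column by position, no longer a list
```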

http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R
index 2e0dea3..5f84a62 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_classification.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_classification.R
@@ -68,12 +68,17 @@ test_that("spark.logit", {
   df <- suppressWarnings(createDataFrame(iris))
   model <- spark.logit(df, Species ~ ., regParam = 0.5)
   summary <- summary(model)
+
+  # test summary coefficients return matrix type
+  expect_true(class(summary$coefficients) == "matrix")
+  expect_true(class(summary$coefficients[, 1]) == "numeric")
+
   versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
   virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
   setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
-  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
-  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
-  setosaCoefs <- unlist(summary$coefficients[, "setosa"])
+  versicolorCoefs <- summary$coefficients[, "versicolor"]
+  virginicaCoefs <- summary$coefficients[, "virginica"]
+  setosaCoefs <- summary$coefficients[, "setosa"]
   expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
   expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
   expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))
@@ -136,8 +141,8 @@ test_that("spark.logit", {
   summary <- summary(model)
   versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
   virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
-  versicolorCoefs <- unlist(summary$coefficients[, "versicolor"])
-  virginicaCoefs <- unlist(summary$coefficients[, "virginica"])
+  versicolorCoefs <- summary$coefficients[, "versicolor"]
+  virginicaCoefs <- summary$coefficients[, "virginica"]
   expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
   expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
 
@@ -145,7 +150,7 @@ test_that("spark.logit", {
   model <- spark.logit(training, Species ~ ., regParam = 0.5)
   summary <- summary(model)
   coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
-  coefs <- unlist(summary$coefficients[, "Estimate"])
+  coefs <- summary$coefficients[, "Estimate"]
   expect_true(all(abs(coefsR - coefs) < 0.1))
 
   # Test prediction with string label

http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
index aad834b..28a6eeb 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
@@ -166,6 +166,10 @@ test_that("spark.kmeans", {
   expect_equal(k, 2)
   expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
 
+  # test summary coefficients return matrix type
+  expect_true(class(summary.model$coefficients) == "matrix")
+  expect_true(class(summary.model$coefficients[1, ]) == "numeric")
+
   # Test model save/load
   modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
   write.ml(model, modelPath)

http://git-wip-us.apache.org/repos/asf/spark/blob/ce112cec/R/pkg/inst/tests/testthat/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
index c450a15..81a5bdc 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_regression.R
@@ -87,11 +87,14 @@ test_that("spark.glm summary", {
   # gaussian family
   training <- suppressWarnings(createDataFrame(iris))
   stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
-
   rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
 
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
+  # test summary coefficients return matrix type
+  expect_true(class(stats$coefficients) == "matrix")
+  expect_true(class(stats$coefficients[, 1]) == "numeric")
+
+  coefs <- stats$coefficients
+  rCoefs <- rStats$coefficients
   expect_true(all(abs(rCoefs - coefs) < 1e-4))
   expect_true(all(
     rownames(stats$coefficients) ==
@@ -117,8 +120,8 @@ test_that("spark.glm summary", {
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
 
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
+  coefs <- stats$coefficients
+  rCoefs <- rStats$coefficients
   expect_true(all(abs(rCoefs - coefs) < 1e-4))
   expect_true(all(
     rownames(stats$coefficients) ==
@@ -141,8 +144,8 @@ test_that("spark.glm summary", {
   stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
   rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
 
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
+  coefs <- stats$coefficients
+  rCoefs <- rStats$coefficients
   expect_true(all(abs(rCoefs - coefs) < 1e-3))
   expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
   expect_equal(stats$dispersion, rStats$dispersion)
@@ -169,7 +172,7 @@ test_that("spark.glm summary", {
   data <- as.data.frame(cbind(A, b))
   df <- createDataFrame(data)
   stats <- summary(spark.glm(df, b ~ . - 1))
-  coefs <- unlist(stats$coefficients)
+  coefs <- stats$coefficients
   expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
 })
 
@@ -259,8 +262,8 @@ test_that("glm summary", {
 
   rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
 
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
+  coefs <- stats$coefficients
+  rCoefs <- rStats$coefficients
   expect_true(all(abs(rCoefs - coefs) < 1e-4))
   expect_true(all(
     rownames(stats$coefficients) ==
@@ -282,8 +285,8 @@ test_that("glm summary", {
   rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
                         family = binomial(link = "logit")))
 
-  coefs <- unlist(stats$coefficients)
-  rCoefs <- unlist(rStats$coefficients)
+  coefs <- stats$coefficients
+  rCoefs <- rStats$coefficients
   expect_true(all(abs(rCoefs - coefs) < 1e-4))
   expect_true(all(
     rownames(stats$coefficients) ==

