You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2016/04/15 17:23:55 UTC

spark git commit: [SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR::glm for more family and link functions

Repository: spark
Updated Branches:
  refs/heads/master 06b9d623e -> 83af297ac


[SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR::glm for more family and link functions

## What changes were proposed in this pull request?
Expose R-like summary statistics in SparkR::glm for more family and link functions.
Note: Not all values in R's [summary.glm](http://stat.ethz.ch/R-manual/R-patched/library/stats/html/summary.glm.html) are exposed; this PR provides only the most commonly used statistics. More statistics can be added in follow-up work.

## How was this patch tested?
Unit tests.

SparkR output:
```
Deviance Residuals:
(Note: These are approximate quantiles with relative error <= 0.01)
     Min        1Q    Median        3Q       Max
-0.95096  -0.16585  -0.00232   0.17410   0.72918

Coefficients:
                    Estimate  Std. Error  t value  Pr(>|t|)
(Intercept)         1.6765    0.23536     7.1231   4.4561e-11
Sepal_Length        0.34988   0.046301    7.5566   4.1873e-12
Species_versicolor  -0.98339  0.072075    -13.644  0
Species_virginica   -1.0075   0.093306    -10.798  0

(Dispersion parameter for gaussian family taken to be 0.08351462)

    Null deviance: 28.307  on 149  degrees of freedom
Residual deviance: 12.193  on 146  degrees of freedom
AIC: 59.22

Number of Fisher Scoring iterations: 1
```
R output:
```
Deviance Residuals:
     Min        1Q    Median        3Q       Max
-0.95096  -0.16522   0.00171   0.18416   0.72918

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
(Intercept)        1.67650    0.23536   7.123 4.46e-11 ***
Sepal.Length       0.34988    0.04630   7.557 4.19e-12 ***
Speciesversicolor -0.98339    0.07207 -13.644  < 2e-16 ***
Speciesvirginica  -1.00751    0.09331 -10.798  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.08351462)

    Null deviance: 28.307  on 149  degrees of freedom
Residual deviance: 12.193  on 146  degrees of freedom
AIC: 59.217

Number of Fisher Scoring iterations: 2
```

cc mengxr

Author: Yanbo Liang <yb...@gmail.com>

Closes #12393 from yanboliang/spark-13925.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/83af297a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/83af297a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/83af297a

Branch: refs/heads/master
Commit: 83af297ac42546580983f91079f74e3a4cf25050
Parents: 06b9d62
Author: Yanbo Liang <yb...@gmail.com>
Authored: Fri Apr 15 08:23:51 2016 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Fri Apr 15 08:23:51 2016 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                                 |  3 +-
 R/pkg/R/mllib.R                                 | 49 ++++++++++++++++--
 R/pkg/inst/tests/testthat/test_mllib.R          | 49 ++++++++++++++++++
 .../r/GeneralizedLinearRegressionWrapper.scala  | 52 +++++++++++++++++---
 4 files changed, 143 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index f48c61c..94ac7e7 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -292,7 +292,8 @@ export("as.DataFrame",
        "tableToDF",
        "tableNames",
        "tables",
-       "uncacheTable")
+       "uncacheTable",
+       "print.summary.GeneralizedLinearRegressionModel")
 
 export("structField",
        "structField.jobj",

http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/R/mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 31bca16..922a9b1 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -101,12 +101,55 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             jobj <- object@jobj
             features <- callJMethod(jobj, "rFeatures")
             coefficients <- callJMethod(jobj, "rCoefficients")
-            coefficients <- as.matrix(unlist(coefficients))
-            colnames(coefficients) <- c("Estimate")
+            deviance.resid <- callJMethod(jobj, "rDevianceResiduals")
+            dispersion <- callJMethod(jobj, "rDispersion")
+            null.deviance <- callJMethod(jobj, "rNullDeviance")
+            deviance <- callJMethod(jobj, "rDeviance")
+            df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
+            df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
+            aic <- callJMethod(jobj, "rAic")
+            iter <- callJMethod(jobj, "rNumIterations")
+            family <- callJMethod(jobj, "rFamily")
+
+            deviance.resid <- dataFrame(deviance.resid)
+            coefficients <- matrix(coefficients, ncol = 4)
+            colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
             rownames(coefficients) <- unlist(features)
-            return(list(coefficients = coefficients))
+            ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
+                        dispersion = dispersion, null.deviance = null.deviance,
+                        deviance = deviance, df.null = df.null, df.residual = df.residual,
+                        aic = aic, iter = iter, family = family)
+            class(ans) <- "summary.GeneralizedLinearRegressionModel"
+            return(ans)
           })
 
+#' Print the summary of GeneralizedLinearRegressionModel
+#'
+#' @rdname print
+#' @name print.summary.GeneralizedLinearRegressionModel
+#' @export
+print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
+  x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
+    c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max"))
+  x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
+  cat("\nDeviance Residuals: \n")
+  cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
+  print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)
+
+  cat("\nCoefficients:\n")
+  print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)
+
+  cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
+    ")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
+    format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
+    " on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"),
+    1L, paste, collapse = " "), sep = "")
+  cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
+    "Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
+  cat("\n")
+  invisible(x)
+  }
+
 #' Make predictions from a generalized linear model
 #'
 #' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().

http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/R/pkg/inst/tests/testthat/test_mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index a9dbd2b..47bbf7e 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -77,6 +77,55 @@ test_that("glm and predict", {
   expect_equal(length(predict(lm(y ~ x))), 15)
 })
 
+test_that("glm summary", {
+  # gaussian family
+  training <- suppressWarnings(createDataFrame(sqlContext, iris))
+  stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+
+  rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+  coefs <- unlist(stats$coefficients)
+  rCoefs <- unlist(rStats$coefficients)
+  expect_true(all(abs(rCoefs - coefs) < 1e-4))
+  expect_true(all(
+    rownames(stats$coefficients) ==
+    c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+  expect_equal(stats$dispersion, rStats$dispersion)
+  expect_equal(stats$null.deviance, rStats$null.deviance)
+  expect_equal(stats$deviance, rStats$deviance)
+  expect_equal(stats$df.null, rStats$df.null)
+  expect_equal(stats$df.residual, rStats$df.residual)
+  expect_equal(stats$aic, rStats$aic)
+
+  # binomial family
+  df <- suppressWarnings(createDataFrame(sqlContext, iris))
+  training <- df[df$Species %in% c("versicolor", "virginica"), ]
+  stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
+    family = binomial(link = "logit")))
+
+  rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+  rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+    family = binomial(link = "logit")))
+
+  coefs <- unlist(stats$coefficients)
+  rCoefs <- unlist(rStats$coefficients)
+  expect_true(all(abs(rCoefs - coefs) < 1e-4))
+  expect_true(all(
+    rownames(stats$coefficients) ==
+    c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+  expect_equal(stats$dispersion, rStats$dispersion)
+  expect_equal(stats$null.deviance, rStats$null.deviance)
+  expect_equal(stats$deviance, rStats$deviance)
+  expect_equal(stats$df.null, rStats$df.null)
+  expect_equal(stats$df.residual, rStats$df.residual)
+  expect_equal(stats$aic, rStats$aic)
+
+  # Test summary works on base GLM models
+  baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+  baseSummary <- summary(baseModel)
+  expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+})
+
 test_that("kmeans", {
   newIris <- iris
   newIris$Species <- NULL

http://git-wip-us.apache.org/repos/asf/spark/blob/83af297a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index 475a308..f66323e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -30,19 +30,59 @@ private[r] class GeneralizedLinearRegressionWrapper private (
   private val glm: GeneralizedLinearRegressionModel =
     pipeline.stages(1).asInstanceOf[GeneralizedLinearRegressionModel]
 
+  lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
+    Array("(Intercept)") ++ features
+  } else {
+    features
+  }
+
   lazy val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
-    Array(glm.intercept) ++ glm.coefficients.toArray
+    Array(glm.intercept) ++ glm.coefficients.toArray ++
+      rCoefficientStandardErrors ++ rTValues ++ rPValues
   } else {
-    glm.coefficients.toArray
+    glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
   }
 
-  lazy val rFeatures: Array[String] = if (glm.getFitIntercept) {
-    Array("(Intercept)") ++ features
+  private lazy val rCoefficientStandardErrors = if (glm.getFitIntercept) {
+    Array(glm.summary.coefficientStandardErrors.last) ++
+      glm.summary.coefficientStandardErrors.dropRight(1)
   } else {
-    features
+    glm.summary.coefficientStandardErrors
+  }
+
+  private lazy val rTValues = if (glm.getFitIntercept) {
+    Array(glm.summary.tValues.last) ++ glm.summary.tValues.dropRight(1)
+  } else {
+    glm.summary.tValues
   }
 
-  def transform(dataset: DataFrame): DataFrame = {
+  private lazy val rPValues = if (glm.getFitIntercept) {
+    Array(glm.summary.pValues.last) ++ glm.summary.pValues.dropRight(1)
+  } else {
+    glm.summary.pValues
+  }
+
+  lazy val rDispersion: Double = glm.summary.dispersion
+
+  lazy val rNullDeviance: Double = glm.summary.nullDeviance
+
+  lazy val rDeviance: Double = glm.summary.deviance
+
+  lazy val rResidualDegreeOfFreedomNull: Long = glm.summary.residualDegreeOfFreedomNull
+
+  lazy val rResidualDegreeOfFreedom: Long = glm.summary.residualDegreeOfFreedom
+
+  lazy val rAic: Double = glm.summary.aic
+
+  lazy val rNumIterations: Int = glm.summary.numIterations
+
+  lazy val rDevianceResiduals: DataFrame = glm.summary.residuals()
+
+  lazy val rFamily: String = glm.getFamily
+
+  def residuals(residualsType: String): DataFrame = glm.summary.residuals(residualsType)
+
+  def transform(dataset: Dataset[_]): DataFrame = {
     pipeline.transform(dataset).drop(glm.getFeaturesCol)
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org