You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2016/11/23 03:17:58 UTC
spark git commit: [SPARK-18501][ML][SPARKR] Fix spark.glm errors when
fitting on collinear data
Repository: spark
Updated Branches:
refs/heads/master d0212eb0f -> 982b82e32
[SPARK-18501][ML][SPARKR] Fix spark.glm errors when fitting on collinear data
## What changes were proposed in this pull request?
* Fix SparkR ```spark.glm``` errors when fitting on collinear data, since ```standard error of coefficients, t value and p value``` are not available in this condition.
* The Scala/Python GLM summary now throws an exception if users request ```standard error of coefficients, t value and p value``` when the underlying WLS was solved by local "l-bfgs".
## How was this patch tested?
Add unit tests.
Author: Yanbo Liang <yb...@gmail.com>
Closes #15930 from yanboliang/spark-18501.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/982b82e3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/982b82e3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/982b82e3
Branch: refs/heads/master
Commit: 982b82e32e0fc7d30c5d557944a79eb3e6d2da59
Parents: d0212eb
Author: Yanbo Liang <yb...@gmail.com>
Authored: Tue Nov 22 19:17:48 2016 -0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Tue Nov 22 19:17:48 2016 -0800
----------------------------------------------------------------------
R/pkg/R/mllib.R | 21 ++++++--
R/pkg/inst/tests/testthat/test_mllib.R | 9 ++++
.../r/GeneralizedLinearRegressionWrapper.scala | 54 +++++++++++---------
.../GeneralizedLinearRegression.scala | 46 ++++++++++++++---
.../GeneralizedLinearRegressionSuite.scala | 21 ++++++++
5 files changed, 115 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/R/pkg/R/mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 265e64e..02bc645 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -278,8 +278,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
#' @param object a fitted generalized linear model.
#' @return \code{summary} returns a summary object of the fitted model, a list of components
-#' including at least the coefficients, null/residual deviance, null/residual degrees
-#' of freedom, AIC and number of iterations IRLS takes.
+#' including at least the coefficients matrix (which includes coefficients, standard error
+#' of coefficients, t value and p value), null/residual deviance, null/residual degrees of
+#' freedom, AIC and number of iterations IRLS takes. If there are collinear columns
+#' in your data, the coefficients matrix only provides coefficients.
#'
#' @rdname spark.glm
#' @export
@@ -303,9 +305,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
} else {
dataFrame(callJMethod(jobj, "rDevianceResiduals"))
}
- coefficients <- matrix(coefficients, ncol = 4)
- colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
- rownames(coefficients) <- unlist(features)
+ # If the underlying WeightedLeastSquares uses the "normal" solver, we can provide
+ # coefficients, standard error of coefficients, t value and p value. Otherwise,
+ # it is fitted by local "l-bfgs" and we can only provide coefficients.
+ if (length(features) == length(coefficients)) {
+ coefficients <- matrix(coefficients, ncol = 1)
+ colnames(coefficients) <- c("Estimate")
+ rownames(coefficients) <- unlist(features)
+ } else {
+ coefficients <- matrix(coefficients, ncol = 4)
+ colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
+ rownames(coefficients) <- unlist(features)
+ }
ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
dispersion = dispersion, null.deviance = null.deviance,
deviance = deviance, df.null = df.null, df.residual = df.residual,
http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/R/pkg/inst/tests/testthat/test_mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 2a97a51..467e00c 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -169,6 +169,15 @@ test_that("spark.glm summary", {
df <- suppressWarnings(createDataFrame(data))
regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
+
+ # Test spark.glm works on collinear data
+ A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
+ b <- c(1, 2, 3, 4)
+ data <- as.data.frame(cbind(A, b))
+ df <- createDataFrame(data)
+ stats <- summary(spark.glm(df, b ~ . - 1))
+ coefs <- unlist(stats$coefficients)
+ expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
})
test_that("spark.glm save/load", {
http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index add4d49..8bcc9fe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -144,30 +144,38 @@ private[r] object GeneralizedLinearRegressionWrapper
features
}
- val rCoefficientStandardErrors = if (glm.getFitIntercept) {
- Array(summary.coefficientStandardErrors.last) ++
- summary.coefficientStandardErrors.dropRight(1)
+ val rCoefficients: Array[Double] = if (summary.isNormalSolver) {
+ val rCoefficientStandardErrors = if (glm.getFitIntercept) {
+ Array(summary.coefficientStandardErrors.last) ++
+ summary.coefficientStandardErrors.dropRight(1)
+ } else {
+ summary.coefficientStandardErrors
+ }
+
+ val rTValues = if (glm.getFitIntercept) {
+ Array(summary.tValues.last) ++ summary.tValues.dropRight(1)
+ } else {
+ summary.tValues
+ }
+
+ val rPValues = if (glm.getFitIntercept) {
+ Array(summary.pValues.last) ++ summary.pValues.dropRight(1)
+ } else {
+ summary.pValues
+ }
+
+ if (glm.getFitIntercept) {
+ Array(glm.intercept) ++ glm.coefficients.toArray ++
+ rCoefficientStandardErrors ++ rTValues ++ rPValues
+ } else {
+ glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
+ }
} else {
- summary.coefficientStandardErrors
- }
-
- val rTValues = if (glm.getFitIntercept) {
- Array(summary.tValues.last) ++ summary.tValues.dropRight(1)
- } else {
- summary.tValues
- }
-
- val rPValues = if (glm.getFitIntercept) {
- Array(summary.pValues.last) ++ summary.pValues.dropRight(1)
- } else {
- summary.pValues
- }
-
- val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
- Array(glm.intercept) ++ glm.coefficients.toArray ++
- rCoefficientStandardErrors ++ rTValues ++ rPValues
- } else {
- glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
+ if (glm.getFitIntercept) {
+ Array(glm.intercept) ++ glm.coefficients.toArray
+ } else {
+ glm.coefficients.toArray
+ }
}
val rDispersion: Double = summary.dispersion
http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 3f9de1f..f33dd0f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -1064,44 +1064,74 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
import GeneralizedLinearRegression._
/**
+ * Whether the underlying [[WeightedLeastSquares]] used the "normal" solver.
+ */
+ private[ml] val isNormalSolver: Boolean = {
+ diagInvAtWA.length != 1 || diagInvAtWA(0) != 0
+ }
+
+ /**
* Standard error of estimated coefficients and intercept.
+ * This value is only available when the underlying [[WeightedLeastSquares]]
+ * uses the "normal" solver.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val coefficientStandardErrors: Array[Double] = {
- diagInvAtWA.map(_ * dispersion).map(math.sqrt)
+ if (isNormalSolver) {
+ diagInvAtWA.map(_ * dispersion).map(math.sqrt)
+ } else {
+ throw new UnsupportedOperationException(
+ "No Std. Error of coefficients available for this GeneralizedLinearRegressionModel")
+ }
}
/**
* T-statistic of estimated coefficients and intercept.
+ * This value is only available when the underlying [[WeightedLeastSquares]]
+ * uses the "normal" solver.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val tValues: Array[Double] = {
- val estimate = if (model.getFitIntercept) {
- Array.concat(model.coefficients.toArray, Array(model.intercept))
+ if (isNormalSolver) {
+ val estimate = if (model.getFitIntercept) {
+ Array.concat(model.coefficients.toArray, Array(model.intercept))
+ } else {
+ model.coefficients.toArray
+ }
+ estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
} else {
- model.coefficients.toArray
+ throw new UnsupportedOperationException(
+ "No t-statistic available for this GeneralizedLinearRegressionModel")
}
- estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
}
/**
* Two-sided p-value of estimated coefficients and intercept.
+ * This value is only available when the underlying [[WeightedLeastSquares]]
+ * uses the "normal" solver.
*
* If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
* then the last element returned corresponds to the intercept.
*/
@Since("2.0.0")
lazy val pValues: Array[Double] = {
- if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
- tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) }
+ if (isNormalSolver) {
+ if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
+ tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) }
+ } else {
+ tValues.map { x =>
+ 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x)))
+ }
+ }
} else {
- tValues.map { x => 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) }
+ throw new UnsupportedOperationException(
+ "No p-value available for this GeneralizedLinearRegressionModel")
}
}
}
http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 9b0fa67..4fab216 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -1048,6 +1048,27 @@ class GeneralizedLinearRegressionSuite
assert(summary.solver === "irls")
}
+ test("glm handle collinear features") {
+ val collinearInstances = Seq(
+ Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)),
+ Instance(2.0, 1.0, Vectors.dense(2.0, 4.0)),
+ Instance(3.0, 1.0, Vectors.dense(3.0, 6.0)),
+ Instance(4.0, 1.0, Vectors.dense(4.0, 8.0))
+ ).toDF()
+ val trainer = new GeneralizedLinearRegression()
+ val model = trainer.fit(collinearInstances)
+ // to make it clear that underlying WLS did not solve analytically
+ intercept[UnsupportedOperationException] {
+ model.summary.coefficientStandardErrors
+ }
+ intercept[UnsupportedOperationException] {
+ model.summary.pValues
+ }
+ intercept[UnsupportedOperationException] {
+ model.summary.tValues
+ }
+ }
+
test("read/write") {
def checkModelData(
model: GeneralizedLinearRegressionModel,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org