You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2016/11/23 03:17:58 UTC

spark git commit: [SPARK-18501][ML][SPARKR] Fix spark.glm errors when fitting on collinear data

Repository: spark
Updated Branches:
  refs/heads/master d0212eb0f -> 982b82e32


[SPARK-18501][ML][SPARKR] Fix spark.glm errors when fitting on collinear data

## What changes were proposed in this pull request?
* Fix SparkR ```spark.glm``` errors when fitting on collinear data, since ```standard error of coefficients, t value and p value``` are not available in this condition.
* Scala/Python GLM summary should throw exception if users get ```standard error of coefficients, t value and p value``` but the underlying WLS was solved by local "l-bfgs".

## How was this patch tested?
Add unit tests.

Author: Yanbo Liang <yb...@gmail.com>

Closes #15930 from yanboliang/spark-18501.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/982b82e3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/982b82e3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/982b82e3

Branch: refs/heads/master
Commit: 982b82e32e0fc7d30c5d557944a79eb3e6d2da59
Parents: d0212eb
Author: Yanbo Liang <yb...@gmail.com>
Authored: Tue Nov 22 19:17:48 2016 -0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Tue Nov 22 19:17:48 2016 -0800

----------------------------------------------------------------------
 R/pkg/R/mllib.R                                 | 21 ++++++--
 R/pkg/inst/tests/testthat/test_mllib.R          |  9 ++++
 .../r/GeneralizedLinearRegressionWrapper.scala  | 54 +++++++++++---------
 .../GeneralizedLinearRegression.scala           | 46 ++++++++++++++---
 .../GeneralizedLinearRegressionSuite.scala      | 21 ++++++++
 5 files changed, 115 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/R/pkg/R/mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R
index 265e64e..02bc645 100644
--- a/R/pkg/R/mllib.R
+++ b/R/pkg/R/mllib.R
@@ -278,8 +278,10 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
 
 #' @param object a fitted generalized linear model.
 #' @return \code{summary} returns a summary object of the fitted model, a list of components
-#'         including at least the coefficients, null/residual deviance, null/residual degrees
-#'         of freedom, AIC and number of iterations IRLS takes.
+#'         including at least the coefficients matrix (which includes coefficients, standard error
+#'         of coefficients, t value and p value), null/residual deviance, null/residual degrees of
+#'         freedom, AIC and number of iterations IRLS takes. If there are collinear columns
+#'         in you data, the coefficients matrix only provides coefficients.
 #'
 #' @rdname spark.glm
 #' @export
@@ -303,9 +305,18 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
             } else {
               dataFrame(callJMethod(jobj, "rDevianceResiduals"))
             }
-            coefficients <- matrix(coefficients, ncol = 4)
-            colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
-            rownames(coefficients) <- unlist(features)
+            # If the underlying WeightedLeastSquares using "normal" solver, we can provide
+            # coefficients, standard error of coefficients, t value and p value. Otherwise,
+            # it will be fitted by local "l-bfgs", we can only provide coefficients.
+            if (length(features) == length(coefficients)) {
+              coefficients <- matrix(coefficients, ncol = 1)
+              colnames(coefficients) <- c("Estimate")
+              rownames(coefficients) <- unlist(features)
+            } else {
+              coefficients <- matrix(coefficients, ncol = 4)
+              colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
+              rownames(coefficients) <- unlist(features)
+            }
             ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
                         dispersion = dispersion, null.deviance = null.deviance,
                         deviance = deviance, df.null = df.null, df.residual = df.residual,

http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/R/pkg/inst/tests/testthat/test_mllib.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/testthat/test_mllib.R
index 2a97a51..467e00c 100644
--- a/R/pkg/inst/tests/testthat/test_mllib.R
+++ b/R/pkg/inst/tests/testthat/test_mllib.R
@@ -169,6 +169,15 @@ test_that("spark.glm summary", {
   df <- suppressWarnings(createDataFrame(data))
   regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
   expect_equal(regStats$aic, 14.00976, tolerance = 1e-4) # 14.00976 is from summary() result
+
+  # Test spark.glm works on collinear data
+  A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
+  b <- c(1, 2, 3, 4)
+  data <- as.data.frame(cbind(A, b))
+  df <- createDataFrame(data)
+  stats <- summary(spark.glm(df, b ~ . - 1))
+  coefs <- unlist(stats$coefficients)
+  expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
 })
 
 test_that("spark.glm save/load", {

http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index add4d49..8bcc9fe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -144,30 +144,38 @@ private[r] object GeneralizedLinearRegressionWrapper
       features
     }
 
-    val rCoefficientStandardErrors = if (glm.getFitIntercept) {
-      Array(summary.coefficientStandardErrors.last) ++
-        summary.coefficientStandardErrors.dropRight(1)
+    val rCoefficients: Array[Double] = if (summary.isNormalSolver) {
+      val rCoefficientStandardErrors = if (glm.getFitIntercept) {
+        Array(summary.coefficientStandardErrors.last) ++
+          summary.coefficientStandardErrors.dropRight(1)
+      } else {
+        summary.coefficientStandardErrors
+      }
+
+      val rTValues = if (glm.getFitIntercept) {
+        Array(summary.tValues.last) ++ summary.tValues.dropRight(1)
+      } else {
+        summary.tValues
+      }
+
+      val rPValues = if (glm.getFitIntercept) {
+        Array(summary.pValues.last) ++ summary.pValues.dropRight(1)
+      } else {
+        summary.pValues
+      }
+
+      if (glm.getFitIntercept) {
+        Array(glm.intercept) ++ glm.coefficients.toArray ++
+          rCoefficientStandardErrors ++ rTValues ++ rPValues
+      } else {
+        glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
+      }
     } else {
-      summary.coefficientStandardErrors
-    }
-
-    val rTValues = if (glm.getFitIntercept) {
-      Array(summary.tValues.last) ++ summary.tValues.dropRight(1)
-    } else {
-      summary.tValues
-    }
-
-    val rPValues = if (glm.getFitIntercept) {
-      Array(summary.pValues.last) ++ summary.pValues.dropRight(1)
-    } else {
-      summary.pValues
-    }
-
-    val rCoefficients: Array[Double] = if (glm.getFitIntercept) {
-      Array(glm.intercept) ++ glm.coefficients.toArray ++
-        rCoefficientStandardErrors ++ rTValues ++ rPValues
-    } else {
-      glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
+      if (glm.getFitIntercept) {
+        Array(glm.intercept) ++ glm.coefficients.toArray
+      } else {
+        glm.coefficients.toArray
+      }
     }
 
     val rDispersion: Double = summary.dispersion

http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 3f9de1f..f33dd0f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -1064,44 +1064,74 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
   import GeneralizedLinearRegression._
 
   /**
+   * Whether the underlying [[WeightedLeastSquares]] using the "normal" solver.
+   */
+  private[ml] val isNormalSolver: Boolean = {
+    diagInvAtWA.length != 1 || diagInvAtWA(0) != 0
+  }
+
+  /**
    * Standard error of estimated coefficients and intercept.
+   * This value is only available when the underlying [[WeightedLeastSquares]]
+   * using the "normal" solver.
    *
    * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
    * then the last element returned corresponds to the intercept.
    */
   @Since("2.0.0")
   lazy val coefficientStandardErrors: Array[Double] = {
-    diagInvAtWA.map(_ * dispersion).map(math.sqrt)
+    if (isNormalSolver) {
+      diagInvAtWA.map(_ * dispersion).map(math.sqrt)
+    } else {
+      throw new UnsupportedOperationException(
+        "No Std. Error of coefficients available for this GeneralizedLinearRegressionModel")
+    }
   }
 
   /**
    * T-statistic of estimated coefficients and intercept.
+   * This value is only available when the underlying [[WeightedLeastSquares]]
+   * using the "normal" solver.
    *
    * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
    * then the last element returned corresponds to the intercept.
    */
   @Since("2.0.0")
   lazy val tValues: Array[Double] = {
-    val estimate = if (model.getFitIntercept) {
-      Array.concat(model.coefficients.toArray, Array(model.intercept))
+    if (isNormalSolver) {
+      val estimate = if (model.getFitIntercept) {
+        Array.concat(model.coefficients.toArray, Array(model.intercept))
+      } else {
+        model.coefficients.toArray
+      }
+      estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
     } else {
-      model.coefficients.toArray
+      throw new UnsupportedOperationException(
+        "No t-statistic available for this GeneralizedLinearRegressionModel")
     }
-    estimate.zip(coefficientStandardErrors).map { x => x._1 / x._2 }
   }
 
   /**
    * Two-sided p-value of estimated coefficients and intercept.
+   * This value is only available when the underlying [[WeightedLeastSquares]]
+   * using the "normal" solver.
    *
    * If [[GeneralizedLinearRegression.fitIntercept]] is set to true,
    * then the last element returned corresponds to the intercept.
    */
   @Since("2.0.0")
   lazy val pValues: Array[Double] = {
-    if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
-      tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) }
+    if (isNormalSolver) {
+      if (model.getFamily == Binomial.name || model.getFamily == Poisson.name) {
+        tValues.map { x => 2.0 * (1.0 - dist.Gaussian(0.0, 1.0).cdf(math.abs(x))) }
+      } else {
+        tValues.map { x =>
+          2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x)))
+        }
+      }
     } else {
-      tValues.map { x => 2.0 * (1.0 - dist.StudentsT(degreesOfFreedom.toDouble).cdf(math.abs(x))) }
+      throw new UnsupportedOperationException(
+        "No p-value available for this GeneralizedLinearRegressionModel")
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/982b82e3/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 9b0fa67..4fab216 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -1048,6 +1048,27 @@ class GeneralizedLinearRegressionSuite
     assert(summary.solver === "irls")
   }
 
+  test("glm handle collinear features") {
+    val collinearInstances = Seq(
+      Instance(1.0, 1.0, Vectors.dense(1.0, 2.0)),
+      Instance(2.0, 1.0, Vectors.dense(2.0, 4.0)),
+      Instance(3.0, 1.0, Vectors.dense(3.0, 6.0)),
+      Instance(4.0, 1.0, Vectors.dense(4.0, 8.0))
+    ).toDF()
+    val trainer = new GeneralizedLinearRegression()
+    val model = trainer.fit(collinearInstances)
+    // to make it clear that underlying WLS did not solve analytically
+    intercept[UnsupportedOperationException] {
+      model.summary.coefficientStandardErrors
+    }
+    intercept[UnsupportedOperationException] {
+      model.summary.pValues
+    }
+    intercept[UnsupportedOperationException] {
+      model.summary.tValues
+    }
+  }
+
   test("read/write") {
     def checkModelData(
         model: GeneralizedLinearRegressionModel,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org