You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2017/07/27 14:01:11 UTC
spark git commit: [SPARK-19270][ML] Add summary table to GLM summary
Repository: spark
Updated Branches:
refs/heads/master 2ff35a057 -> ddcd2e826
[SPARK-19270][ML] Add summary table to GLM summary
## What changes were proposed in this pull request?
Add R-like summary table to GLM summary, which includes feature name (if exist), parameter estimate, standard error, t-stat and p-value. This allows scala users to easily gather these commonly used inference results.
srowen yanboliang felixcheung
## How was this patch tested?
New tests. One for testing feature Name, and one for testing the summary Table.
Author: actuaryzhang <ac...@gmail.com>
Author: Wayne Zhang <ac...@gmail.com>
Author: Yanbo Liang <yb...@gmail.com>
Closes #16630 from actuaryzhang/glmTable.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ddcd2e82
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ddcd2e82
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ddcd2e82
Branch: refs/heads/master
Commit: ddcd2e8269db36e4b43edd5cee921d4b12def203
Parents: 2ff35a0
Author: actuaryzhang <ac...@gmail.com>
Authored: Thu Jul 27 22:00:59 2017 +0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Thu Jul 27 22:00:59 2017 +0800
----------------------------------------------------------------------
.../r/GeneralizedLinearRegressionWrapper.scala | 39 ++-----
.../GeneralizedLinearRegression.scala | 111 ++++++++++++++++++-
.../GeneralizedLinearRegressionSuite.scala | 83 +++++++++++++-
3 files changed, 199 insertions(+), 34 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/ddcd2e82/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
index ee1fc9b..176a6cf 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala
@@ -83,11 +83,7 @@ private[r] object GeneralizedLinearRegressionWrapper
.setStringIndexerOrderType(stringIndexerOrderType)
checkDataColumns(rFormula, data)
val rFormulaModel = rFormula.fit(data)
- // get labels and feature names from output schema
- val schema = rFormulaModel.transform(data).schema
- val featureAttrs = AttributeGroup.fromStructField(schema(rFormula.getFeaturesCol))
- .attributes.get
- val features = featureAttrs.map(_.name.get)
+
// assemble and fit the pipeline
val glr = new GeneralizedLinearRegression()
.setFamily(family)
@@ -113,37 +109,16 @@ private[r] object GeneralizedLinearRegressionWrapper
val summary = glm.summary
val rFeatures: Array[String] = if (glm.getFitIntercept) {
- Array("(Intercept)") ++ features
+ Array("(Intercept)") ++ summary.featureNames
} else {
- features
+ summary.featureNames
}
val rCoefficients: Array[Double] = if (summary.isNormalSolver) {
- val rCoefficientStandardErrors = if (glm.getFitIntercept) {
- Array(summary.coefficientStandardErrors.last) ++
- summary.coefficientStandardErrors.dropRight(1)
- } else {
- summary.coefficientStandardErrors
- }
-
- val rTValues = if (glm.getFitIntercept) {
- Array(summary.tValues.last) ++ summary.tValues.dropRight(1)
- } else {
- summary.tValues
- }
-
- val rPValues = if (glm.getFitIntercept) {
- Array(summary.pValues.last) ++ summary.pValues.dropRight(1)
- } else {
- summary.pValues
- }
-
- if (glm.getFitIntercept) {
- Array(glm.intercept) ++ glm.coefficients.toArray ++
- rCoefficientStandardErrors ++ rTValues ++ rPValues
- } else {
- glm.coefficients.toArray ++ rCoefficientStandardErrors ++ rTValues ++ rPValues
- }
+ summary.coefficientsWithStatistics.map(_._2) ++
+ summary.coefficientsWithStatistics.map(_._3) ++
+ summary.coefficientsWithStatistics.map(_._4) ++
+ summary.coefficientsWithStatistics.map(_._5)
} else {
if (glm.getFitIntercept) {
Array(glm.intercept) ++ glm.coefficients.toArray
http://git-wip-us.apache.org/repos/asf/spark/blob/ddcd2e82/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 815607f..917a4d2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -20,12 +20,14 @@ package org.apache.spark.ml.regression
import java.util.Locale
import breeze.stats.{distributions => dist}
+import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.PredictorParams
+import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.feature.{Instance, OffsetInstance}
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim._
@@ -37,7 +39,6 @@ import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DataType, DoubleType, StructType}
-
/**
* Params for Generalized Linear Regression.
*/
@@ -141,6 +142,7 @@ private[regression] trait GeneralizedLinearRegressionBase extends PredictorParam
/**
* Param for offset column name. If this is not set or empty, we treat all instance offsets
* as 0.0. The feature specified as offset has a constant coefficient of 1.0.
+ *
* @group param
*/
@Since("2.3.0")
@@ -1204,6 +1206,21 @@ class GeneralizedLinearRegressionSummary private[regression] (
@Since("2.2.0")
lazy val numInstances: Long = predictions.count()
+
+ /**
+ * Name of features. If the name cannot be retrieved from attributes,
+ * set default names to feature column name with numbered suffix "_0", "_1", and so on.
+ */
+ private[ml] lazy val featureNames: Array[String] = {
+ val featureAttrs = AttributeGroup.fromStructField(
+ dataset.schema(model.getFeaturesCol)).attributes
+ if (featureAttrs.isDefined) {
+ featureAttrs.get.map(_.name.get)
+ } else {
+ Array.tabulate[String](origModel.numFeatures)((x: Int) => model.getFeaturesCol + "_" + x)
+ }
+ }
+
/** The numeric rank of the fitted linear model. */
@Since("2.0.0")
lazy val rank: Long = if (model.getFitIntercept) {
@@ -1458,4 +1475,96 @@ class GeneralizedLinearRegressionTrainingSummary private[regression] (
"No p-value available for this GeneralizedLinearRegressionModel")
}
}
+
+ /**
+ * Coefficients with statistics: feature name, coefficients, standard error, tValue and pValue.
+ */
+ private[ml] lazy val coefficientsWithStatistics: Array[
+ (String, Double, Double, Double, Double)] = {
+ var featureNamesLocal = featureNames
+ var coefficientsArray = model.coefficients.toArray
+ var index = Array.range(0, coefficientsArray.length)
+ if (model.getFitIntercept) {
+ featureNamesLocal = featureNamesLocal :+ "(Intercept)"
+ coefficientsArray = coefficientsArray :+ model.intercept
+ // Reorder so that intercept comes first
+ index = (coefficientsArray.length - 1) +: index
+ }
+ index.map { i =>
+ (featureNamesLocal(i), coefficientsArray(i), coefficientStandardErrors(i),
+ tValues(i), pValues(i))
+ }
+ }
+
+ override def toString: String = {
+ if (isNormalSolver) {
+
+ def round(x: Double): String = {
+ BigDecimal(x).setScale(4, BigDecimal.RoundingMode.HALF_UP).toString
+ }
+
+ val colNames = Array("Feature", "Estimate", "Std Error", "T Value", "P Value")
+
+ val data = coefficientsWithStatistics.map { row =>
+ val strRow = row.productIterator.map { cell =>
+ val str = cell match {
+ case s: String => s
+ case n: Double => round(n)
+ }
+ // Truncate if length > 20
+ if (str.length > 20) {
+ str.substring(0, 17) + "..."
+ } else {
+ str
+ }
+ }
+ strRow.toArray
+ }
+
+ // Compute the width of each column
+ val colWidths = colNames.map(_.length)
+ data.foreach { strRow =>
+ strRow.zipWithIndex.foreach { case (cell: String, i: Int) =>
+ colWidths(i) = math.max(colWidths(i), cell.length)
+ }
+ }
+
+ val sb = new StringBuilder
+
+ // Output coefficients with statistics
+ sb.append("Coefficients:\n")
+ colNames.zipWithIndex.map { case (colName: String, i: Int) =>
+ StringUtils.leftPad(colName, colWidths(i))
+ }.addString(sb, "", " ", "\n")
+
+ data.foreach { case strRow: Array[String] =>
+ strRow.zipWithIndex.map { case (cell: String, i: Int) =>
+ StringUtils.leftPad(cell.toString, colWidths(i))
+ }.addString(sb, "", " ", "\n")
+ }
+
+ sb.append("\n")
+ sb.append(s"(Dispersion parameter for ${family.name} family taken to be " +
+ s"${round(dispersion)})")
+
+ sb.append("\n")
+ val nd = s"Null deviance: ${round(nullDeviance)} on $degreesOfFreedom degrees of freedom"
+ val rd = s"Residual deviance: ${round(deviance)} on $residualDegreeOfFreedom degrees of " +
+ "freedom"
+ val l = math.max(nd.length, rd.length)
+ sb.append(StringUtils.leftPad(nd, l))
+ sb.append("\n")
+ sb.append(StringUtils.leftPad(rd, l))
+
+ if (family.name != "tweedie") {
+ sb.append("\n")
+ sb.append(s"AIC: " + round(aic))
+ }
+
+ sb.toString()
+ } else {
+ throw new UnsupportedOperationException(
+ "No summary available for this GeneralizedLinearRegressionModel")
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/spark/blob/ddcd2e82/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index a47bd17..df7dee8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -22,7 +22,7 @@ import scala.util.Random
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.LogisticRegressionSuite._
import org.apache.spark.ml.feature.{Instance, OffsetInstance}
-import org.apache.spark.ml.feature.LabeledPoint
+import org.apache.spark.ml.feature.{LabeledPoint, RFormula}
import org.apache.spark.ml.linalg.{BLAS, DenseVector, Vector, Vectors}
import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
@@ -1524,6 +1524,87 @@ class GeneralizedLinearRegressionSuite
.fit(datasetGaussianIdentity.as[LabeledPoint])
}
+ test("glm summary: feature name") {
+ // dataset1 with no attribute
+ val dataset1 = Seq(
+ Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)),
+ Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)),
+ Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)),
+ Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)),
+ Instance(2.0, 5.0, Vectors.dense(2.0, 3.0))
+ ).toDF()
+
+ // dataset2 with attribute
+ val datasetTmp = Seq(
+ (2.0, 1.0, 0.0, 5.0),
+ (8.0, 2.0, 1.0, 7.0),
+ (3.0, 3.0, 2.0, 11.0),
+ (9.0, 4.0, 3.0, 13.0),
+ (2.0, 5.0, 2.0, 3.0)
+ ).toDF("y", "w", "x1", "x2")
+ val formula = new RFormula().setFormula("y ~ x1 + x2")
+ val dataset2 = formula.fit(datasetTmp).transform(datasetTmp)
+
+ val expectedFeature = Seq(Array("features_0", "features_1"), Array("x1", "x2"))
+
+ var idx = 0
+ for (dataset <- Seq(dataset1, dataset2)) {
+ val model = new GeneralizedLinearRegression().fit(dataset)
+ model.summary.featureNames.zip(expectedFeature(idx))
+ .foreach{ x => assert(x._1 === x._2) }
+ idx += 1
+ }
+ }
+
+ test("glm summary: coefficient with statistics") {
+ /*
+ R code:
+
+ A <- matrix(c(0, 1, 2, 3, 2, 5, 7, 11, 13, 3), 5, 2)
+ b <- c(2, 8, 3, 9, 2)
+ df <- as.data.frame(cbind(A, b))
+ model <- glm(formula = "b ~ .", data = df)
+ summary(model)
+
+ Coefficients:
+ Estimate Std. Error t value Pr(>|t|)
+ (Intercept) 0.7903 4.0129 0.197 0.862
+ V1 0.2258 2.1153 0.107 0.925
+ V2 0.4677 0.5815 0.804 0.506
+ */
+ val dataset = Seq(
+ Instance(2.0, 1.0, Vectors.dense(0.0, 5.0)),
+ Instance(8.0, 2.0, Vectors.dense(1.0, 7.0)),
+ Instance(3.0, 3.0, Vectors.dense(2.0, 11.0)),
+ Instance(9.0, 4.0, Vectors.dense(3.0, 13.0)),
+ Instance(2.0, 5.0, Vectors.dense(2.0, 3.0))
+ ).toDF()
+
+ val expectedFeature = Seq(Array("features_0", "features_1"),
+ Array("(Intercept)", "features_0", "features_1"))
+ val expectedEstimate = Seq(Vectors.dense(0.2884, 0.538),
+ Vectors.dense(0.7903, 0.2258, 0.4677))
+ val expectedStdError = Seq(Vectors.dense(1.724, 0.3787),
+ Vectors.dense(4.0129, 2.1153, 0.5815))
+
+ var idx = 0
+ for (fitIntercept <- Seq(false, true)) {
+ val trainer = new GeneralizedLinearRegression()
+ .setFamily("gaussian")
+ .setFitIntercept(fitIntercept)
+ val model = trainer.fit(dataset)
+ val coefficientsWithStatistics = model.summary.coefficientsWithStatistics
+
+ coefficientsWithStatistics.map(_._1).zip(expectedFeature(idx)).foreach { x =>
+ assert(x._1 === x._2, "Feature name mismatch in coefficientsWithStatistics") }
+ assert(Vectors.dense(coefficientsWithStatistics.map(_._2)) ~= expectedEstimate(idx)
+ absTol 1E-3, "Coefficients mismatch in coefficientsWithStatistics")
+ assert(Vectors.dense(coefficientsWithStatistics.map(_._3)) ~= expectedStdError(idx)
+ absTol 1E-3, "Standard error mismatch in coefficientsWithStatistics")
+ idx += 1
+ }
+ }
+
test("generalized linear regression: regularization parameter") {
/*
R code:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org