You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2016/12/13 21:27:34 UTC
spark git commit: [SPARK-18715][ML] Fix AIC calculations in Binomial
GLM
Repository: spark
Updated Branches:
refs/heads/master 43298d157 -> e57e3938c
[SPARK-18715][ML] Fix AIC calculations in Binomial GLM
The AIC calculation in Binomial GLM seems to be off when the response or weight is non-integer: the result is different from that in R. This issue arises when one models rates, i.e, num of successes normalized over num of trials, and uses num of trials as weights. In this case, the effective likelihood is weight * label ~ binomial(weight, mu), where weight = number of trials, and weight * label = number of successes and mu = is the success rate.
srowen sethah yanboliang HyukjinKwon zhengruifeng
## What changes were proposed in this pull request?
I suggest changing the current aic calculation for the Binomial family from
```
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
weight * dist.Binomial(1, mu).logProbabilityOf(math.round(y).toInt)
}.sum()
```
to the following which generalizes to the case of real-valued response and weights.
```
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
val wt = math.round(weight).toInt
if (wt == 0){
0.0
} else {
dist.Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt)
}
}.sum()
```
## How was this patch tested?
I will write the unit test once the community wants to include the proposed change. For now, the following modifies existing tests in weighted Binomial GLM to illustrate the issue. The second label is changed from 0 to 0.5.
```
val datasetWithWeight = Seq(
(1.0, 1.0, 0.0, 5.0),
(0.5, 2.0, 1.0, 2.0),
(1.0, 3.0, 2.0, 1.0),
(0.0, 4.0, 3.0, 3.0)
).toDF("y", "w", "x1", "x2")
val formula = (new RFormula()
.setFormula("y ~ x1 + x2")
.setFeaturesCol("features")
.setLabelCol("label"))
val output = formula.fit(datasetWithWeight).transform(datasetWithWeight).select("features", "label", "w")
val glr = new GeneralizedLinearRegression()
.setFamily("binomial")
.setWeightCol("w")
.setFitIntercept(false)
.setRegParam(0)
val model = glr.fit(output)
model.summary.aic
```
The AIC from Spark is 17.3227, and the AIC from R is 15.66454.
Author: actuaryzhang <ac...@gmail.com>
Closes #16149 from actuaryzhang/aic.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e57e3938
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e57e3938
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e57e3938
Branch: refs/heads/master
Commit: e57e3938c69fb1d91970341f027f2ab5000d2daa
Parents: 43298d1
Author: actuaryzhang <ac...@gmail.com>
Authored: Tue Dec 13 21:27:29 2016 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Dec 13 21:27:29 2016 +0000
----------------------------------------------------------------------
.../GeneralizedLinearRegression.scala | 18 +++++--
.../GeneralizedLinearRegressionSuite.scala | 57 ++++++++++----------
2 files changed, 43 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/e57e3938/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index f137c8c..3891ae6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -215,6 +215,8 @@ class GeneralizedLinearRegression @Since("2.0.0") (@Since("2.0.0") override val
* Sets the value of param [[weightCol]].
* If this is not set or empty, we treat all instance weights as 1.0.
* Default is not set, so all instances have weight one.
+ * In the Binomial family, weights correspond to number of trials and should be integer.
+ * Non-integer weights are rounded to integer in AIC calculation.
*
* @group setParam
*/
@@ -467,10 +469,12 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
override def variance(mu: Double): Double = mu * (1.0 - mu)
+ private def ylogy(y: Double, mu: Double): Double = {
+ if (y == 0) 0.0 else y * math.log(y / mu)
+ }
+
override def deviance(y: Double, mu: Double, weight: Double): Double = {
- val my = 1.0 - y
- 2.0 * weight * (y * math.log(math.max(y, 1.0) / mu) +
- my * math.log(math.max(my, 1.0) / (1.0 - mu)))
+ 2.0 * weight * (ylogy(y, mu) + ylogy(1.0 - y, 1.0 - mu))
}
override def aic(
@@ -479,7 +483,13 @@ object GeneralizedLinearRegression extends DefaultParamsReadable[GeneralizedLine
numInstances: Double,
weightSum: Double): Double = {
-2.0 * predictions.map { case (y: Double, mu: Double, weight: Double) =>
- weight * dist.Binomial(1, mu).logProbabilityOf(math.round(y).toInt)
+ // weights for Binomial distribution correspond to number of trials
+ val wt = math.round(weight).toInt
+ if (wt == 0) {
+ 0.0
+ } else {
+ dist.Binomial(wt, mu).logProbabilityOf(math.round(y * weight).toInt)
+ }
}.sum()
}
http://git-wip-us.apache.org/repos/asf/spark/blob/e57e3938/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 3e9e1fc..ed24c1e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -711,16 +711,17 @@ class GeneralizedLinearRegressionSuite
R code:
A <- matrix(c(0, 1, 2, 3, 5, 2, 1, 3), 4, 2)
- b <- c(1, 0, 1, 0)
- w <- c(1, 2, 3, 4)
+ b <- c(1, 0.5, 1, 0)
+ w <- c(1, 2.0, 0.3, 4.7)
df <- as.data.frame(cbind(A, b))
*/
val datasetWithWeight = Seq(
Instance(1.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
- Instance(0.0, 2.0, Vectors.dense(1.0, 2.0)),
- Instance(1.0, 3.0, Vectors.dense(2.0, 1.0)),
- Instance(0.0, 4.0, Vectors.dense(3.0, 3.0))
+ Instance(0.5, 2.0, Vectors.dense(1.0, 2.0)),
+ Instance(1.0, 0.3, Vectors.dense(2.0, 1.0)),
+ Instance(0.0, 4.7, Vectors.dense(3.0, 3.0))
).toDF()
+
/*
R code:
@@ -728,34 +729,34 @@ class GeneralizedLinearRegressionSuite
summary(model)
Deviance Residuals:
- 1 2 3 4
- 1.273 -1.437 2.533 -1.556
+ 1 2 3 4
+ 0.2404 0.1965 1.2824 -0.6916
Coefficients:
Estimate Std. Error z value Pr(>|z|)
- V1 -0.30217 0.46242 -0.653 0.513
- V2 -0.04452 0.37124 -0.120 0.905
+ x1 -1.6901 1.2764 -1.324 0.185
+ x2 0.7059 0.9449 0.747 0.455
(Dispersion parameter for binomial family taken to be 1)
- Null deviance: 13.863 on 4 degrees of freedom
- Residual deviance: 12.524 on 2 degrees of freedom
- AIC: 16.524
+ Null deviance: 8.3178 on 4 degrees of freedom
+ Residual deviance: 2.2193 on 2 degrees of freedom
+ AIC: 5.9915
Number of Fisher Scoring iterations: 5
residuals(model, type="pearson")
1 2 3 4
- 1.117731 -1.162962 2.395838 -1.189005
+ 0.171217 0.197406 2.085864 -0.495332
residuals(model, type="working")
1 2 3 4
- 2.249324 -1.676240 2.913346 -1.353433
+ 1.029315 0.281881 15.502768 -1.052203
residuals(model, type="response")
- 1 2 3 4
- 0.5554219 -0.4034267 0.6567520 -0.2611382
- */
+ 1 2 3 4
+ 0.028480 0.069123 0.935495 -0.049613
+ */
val trainer = new GeneralizedLinearRegression()
.setFamily("binomial")
.setWeightCol("weight")
@@ -763,21 +764,21 @@ class GeneralizedLinearRegressionSuite
val model = trainer.fit(datasetWithWeight)
- val coefficientsR = Vectors.dense(Array(-0.30217, -0.04452))
+ val coefficientsR = Vectors.dense(Array(-1.690134, 0.705929))
val interceptR = 0.0
- val devianceResidualsR = Array(1.273, -1.437, 2.533, -1.556)
- val pearsonResidualsR = Array(1.117731, -1.162962, 2.395838, -1.189005)
- val workingResidualsR = Array(2.249324, -1.676240, 2.913346, -1.353433)
- val responseResidualsR = Array(0.5554219, -0.4034267, 0.6567520, -0.2611382)
- val seCoefR = Array(0.46242, 0.37124)
- val tValsR = Array(-0.653, -0.120)
- val pValsR = Array(0.513, 0.905)
+ val devianceResidualsR = Array(0.2404, 0.1965, 1.2824, -0.6916)
+ val pearsonResidualsR = Array(0.171217, 0.197406, 2.085864, -0.495332)
+ val workingResidualsR = Array(1.029315, 0.281881, 15.502768, -1.052203)
+ val responseResidualsR = Array(0.02848, 0.069123, 0.935495, -0.049613)
+ val seCoefR = Array(1.276417, 0.944934)
+ val tValsR = Array(-1.324124, 0.747068)
+ val pValsR = Array(0.185462, 0.455023)
val dispersionR = 1.0
- val nullDevianceR = 13.863
- val residualDevianceR = 12.524
+ val nullDevianceR = 8.3178
+ val residualDevianceR = 2.2193
val residualDegreeOfFreedomNullR = 4
val residualDegreeOfFreedomR = 2
- val aicR = 16.524
+ val aicR = 5.991537
val summary = model.summary
val devianceResiduals = summary.residuals()
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org