You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by fe...@apache.org on 2017/06/22 03:42:49 UTC
spark git commit: [SPARK-20906][SPARKR] Constrained Logistic
Regression for SparkR
Repository: spark
Updated Branches:
refs/heads/master 215281d88 -> 53543374c
[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
## What changes were proposed in this pull request?
PR https://github.com/apache/spark/pull/17715 Added Constrained Logistic Regression for ML. We should add it to SparkR.
## How was this patch tested?
Add new unit tests.
Author: wangmiao1981 <wm...@hotmail.com>
Closes #18128 from wangmiao1981/test.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/53543374
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/53543374
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/53543374
Branch: refs/heads/master
Commit: 53543374ce0cf0cec26de2382fbc85b7d5c7e9d6
Parents: 215281d
Author: wangmiao1981 <wm...@hotmail.com>
Authored: Wed Jun 21 20:42:45 2017 -0700
Committer: Felix Cheung <fe...@apache.org>
Committed: Wed Jun 21 20:42:45 2017 -0700
----------------------------------------------------------------------
R/pkg/R/mllib_classification.R | 61 +++++++++++++++++++-
.../tests/fulltests/test_mllib_classification.R | 40 +++++++++++++
.../ml/classification/LogisticRegression.scala | 8 +--
.../spark/ml/r/LogisticRegressionWrapper.scala | 34 ++++++++++-
4 files changed, 135 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/R/pkg/R/mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index bdcc081..82d2428 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
#' or the number of partitions are large, this param could be adjusted to a larger size.
#' This is an expert parameter. Default value should be good for most cases.
+#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
+#' The bound matrix must be compatible with the shape (1, number of features) for binomial
+#' regression, or (number of classes, number of features) for multinomial regression.
+#'                          It is an R matrix.
+#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
+#' The bound matrix must be compatible with the shape (1, number of features) for binomial
+#' regression, or (number of classes, number of features) for multinomial regression.
+#'                          It is an R matrix.
+#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
+#' The bounds vector size must be equal to 1 for binomial regression, or the number
+#' of classes for multinomial regression.
+#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
+#' The bound vector size must be equal to 1 for binomial regression, or the number
+#' of classes for multinomial regression.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.logit} returns a fitted logistic regression model.
#' @rdname spark.logit
@@ -241,8 +255,12 @@ function(object, path, overwrite = FALSE) {
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
tol = 1E-6, family = "auto", standardization = TRUE,
- thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
+ thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
+ lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
+ lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
formula <- paste(deparse(formula), collapse = "")
+ row <- 0
+ col <- 0
if (!is.null(weightCol) && weightCol == "") {
weightCol <- NULL
@@ -250,12 +268,51 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
weightCol <- as.character(weightCol)
}
+ if (!is.null(lowerBoundsOnIntercepts)) {
+ lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
+ }
+
+ if (!is.null(upperBoundsOnIntercepts)) {
+ upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
+ }
+
+ if (!is.null(lowerBoundsOnCoefficients)) {
+ if (class(lowerBoundsOnCoefficients) != "matrix") {
+ stop("lowerBoundsOnCoefficients must be a matrix.")
+ }
+ row <- nrow(lowerBoundsOnCoefficients)
+ col <- ncol(lowerBoundsOnCoefficients)
+ lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
+ }
+
+ if (!is.null(upperBoundsOnCoefficients)) {
+ if (class(upperBoundsOnCoefficients) != "matrix") {
+ stop("upperBoundsOnCoefficients must be a matrix.")
+ }
+
+ if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
+ || col != ncol(upperBoundsOnCoefficients))) {
+ stop(paste0("dimension of upperBoundsOnCoefficients ",
+ "is not the same as lowerBoundsOnCoefficients", sep = ""))
+ }
+
+ if (is.null(lowerBoundsOnCoefficients)) {
+ row <- nrow(upperBoundsOnCoefficients)
+ col <- ncol(upperBoundsOnCoefficients)
+ }
+
+ upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
+ }
+
jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
data@sdf, formula, as.numeric(regParam),
as.numeric(elasticNetParam), as.integer(maxIter),
as.numeric(tol), as.character(family),
as.logical(standardization), as.array(thresholds),
- weightCol, as.integer(aggregationDepth))
+ weightCol, as.integer(aggregationDepth),
+ as.integer(row), as.integer(col),
+ lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
+ lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
new("LogisticRegressionModel", jobj = jobj)
})
http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/R/pkg/tests/fulltests/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
index 726e9d9..3d75f4c 100644
--- a/R/pkg/tests/fulltests/test_mllib_classification.R
+++ b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -223,6 +223,46 @@ test_that("spark.logit", {
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
prediction2 <- collect(select(predict(model2, df2), "prediction"))
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
+
+ # Test binomial logistic regression against two classes with upperBoundsOnCoefficients
+ # and upperBoundsOnIntercepts
+ u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
+ model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
+ upperBoundsOnIntercepts = 1.0)
+ summary <- summary(model)
+ coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
+ coefs <- summary$coefficients[, "Estimate"]
+ expect_true(all(abs(coefsR - coefs) < 0.1))
+ # Test upperBoundsOnCoefficients should be matrix
+ expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
+ upperBoundsOnIntercepts = 1.0))
+
+ # Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
+ # and lowerBoundsOnIntercepts
+ l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
+ model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
+ lowerBoundsOnIntercepts = 0.0)
+ summary <- summary(model)
+ coefsR <- c(0, 0, -1, 0, 1.902192)
+ coefs <- summary$coefficients[, "Estimate"]
+ expect_true(all(abs(coefsR - coefs) < 0.1))
+ # Test lowerBoundsOnCoefficients should be matrix
+ expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
+ lowerBoundsOnIntercepts = 0.0))
+
+ # Test multinomial logistic regression with lowerBoundsOnCoefficients
+ # and lowerBoundsOnIntercepts
+ l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
+ model <- spark.logit(training, Species ~ ., family = "multinomial",
+ lowerBoundsOnCoefficients = l,
+ lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
+ summary <- summary(model)
+ versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
+ virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
+ versicolorCoefs <- summary$coefficients[, "versicolor"]
+ virginicaCoefs <- summary$coefficients[, "virginica"]
+ expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+ expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
})
test_that("spark.mlp", {
http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 567af04..b234bc4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* The lower bounds on intercepts if fitting under bound constrained optimization.
- * The bounds vector size must be equal with 1 for binomial regression, or the number
+ * The bounds vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
@@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
/**
* The upper bounds on intercepts if fitting under bound constrained optimization.
- * The bound vector size must be equal with 1 for binomial regression, or the number
+ * The bound vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
@@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
}
if (isSet(lowerBoundsOnIntercepts)) {
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
- "lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
+ "lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
}
if (isSet(upperBoundsOnIntercepts)) {
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
- "upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
+ "upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
}
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
http://git-wip-us.apache.org/repos/asf/spark/blob/53543374/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
index 703bcdf..b96481a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
-import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}
@@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
standardization: Boolean,
thresholds: Array[Double],
weightCol: String,
- aggregationDepth: Int
+ aggregationDepth: Int,
+ numRowsOfBoundsOnCoefficients: Int,
+ numColsOfBoundsOnCoefficients: Int,
+ lowerBoundsOnCoefficients: Array[Double],
+ upperBoundsOnCoefficients: Array[Double],
+ lowerBoundsOnIntercepts: Array[Double],
+ upperBoundsOnIntercepts: Array[Double]
): LogisticRegressionWrapper = {
val rFormula = new RFormula()
@@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper
if (weightCol != null) lr.setWeightCol(weightCol)
+ if (numRowsOfBoundsOnCoefficients != 0 &&
+ numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
+ val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
+ numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
+ lr.setLowerBoundsOnCoefficients(coef)
+ }
+
+ if (numRowsOfBoundsOnCoefficients != 0 &&
+ numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
+ val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
+ numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
+ lr.setUpperBoundsOnCoefficients(coef)
+ }
+
+ if (lowerBoundsOnIntercepts != null) {
+ val intercept = Vectors.dense(lowerBoundsOnIntercepts)
+ lr.setLowerBoundsOnIntercepts(intercept)
+ }
+
+ if (upperBoundsOnIntercepts != null) {
+ val intercept = Vectors.dense(upperBoundsOnIntercepts)
+ lr.setUpperBoundsOnIntercepts(intercept)
+ }
+
val idxToStr = new IndexToString()
.setInputCol(PREDICTED_LABEL_INDEX_COL)
.setOutputCol(PREDICTED_LABEL_COL)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org