You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by yl...@apache.org on 2017/08/02 10:10:34 UTC
spark git commit: [SPARK-20601][ML] Python API for Constrained
Logistic Regression
Repository: spark
Updated Branches:
refs/heads/master 14e75758a -> 845c039ce
[SPARK-20601][ML] Python API for Constrained Logistic Regression
## What changes were proposed in this pull request?
Python API for Constrained Logistic Regression based on #17922 , thanks for the original contribution from zero323 .
## How was this patch tested?
Unit tests.
Author: zero323 <ze...@users.noreply.github.com>
Author: Yanbo Liang <yb...@gmail.com>
Closes #18759 from yanboliang/SPARK-20601.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/845c039c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/845c039c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/845c039c
Branch: refs/heads/master
Commit: 845c039ceb1662632a97631b110e875e934894ad
Parents: 14e7575
Author: zero323 <ze...@users.noreply.github.com>
Authored: Wed Aug 2 18:10:26 2017 +0800
Committer: Yanbo Liang <yb...@gmail.com>
Committed: Wed Aug 2 18:10:26 2017 +0800
----------------------------------------------------------------------
python/pyspark/ml/classification.py | 105 +++++++++++++++++++++++++++++--
python/pyspark/ml/param/__init__.py | 11 +++-
python/pyspark/ml/tests.py | 37 +++++++++++
3 files changed, 148 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index ab1617b..bccf8e7 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -252,18 +252,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"be used in the model. Supported options: auto, binomial, multinomial",
typeConverter=TypeConverters.toString)
+ lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
+ "The lower bounds on coefficients if fitting under bound "
+ "constrained optimization. The bound matrix must be "
+ "compatible with the shape "
+ "(1, number of features) for binomial regression, or "
+ "(number of classes, number of features) "
+ "for multinomial regression.",
+ typeConverter=TypeConverters.toMatrix)
+
+ upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
+ "The upper bounds on coefficients if fitting under bound "
+ "constrained optimization. The bound matrix must be "
+ "compatible with the shape "
+ "(1, number of features) for binomial regression, or "
+ "(number of classes, number of features) "
+ "for multinomial regression.",
+ typeConverter=TypeConverters.toMatrix)
+
+ lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
+ "The lower bounds on intercepts if fitting under bound "
+ "constrained optimization. The bounds vector size must be "
+ "equal with 1 for binomial regression, or the number of "
+ "classes for multinomial regression.",
+ typeConverter=TypeConverters.toVector)
+
+ upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
+ "The upper bounds on intercepts if fitting under bound "
+ "constrained optimization. The bound vector size must be "
+ "equal with 1 for binomial regression, or the number of "
+ "classes for multinomial regression.",
+ typeConverter=TypeConverters.toVector)
+
@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
- aggregationDepth=2, family="auto"):
+ aggregationDepth=2, family="auto",
+ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
+ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
+
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
- aggregationDepth=2, family="auto")
+ aggregationDepth=2, family="auto", \
+ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
+ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
@@ -280,13 +317,17 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
- aggregationDepth=2, family="auto"):
+ aggregationDepth=2, family="auto",
+ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
+ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
- aggregationDepth=2, family="auto")
+ aggregationDepth=2, family="auto", \
+ lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
+ lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
@@ -381,6 +422,62 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"""
return self.getOrDefault(self.family)
+ @since("2.3.0")
+ def setLowerBoundsOnCoefficients(self, value):
+ """
+ Sets the value of :py:attr:`lowerBoundsOnCoefficients`
+ """
+ return self._set(lowerBoundsOnCoefficients=value)
+
+ @since("2.3.0")
+ def getLowerBoundsOnCoefficients(self):
+ """
+ Gets the value of :py:attr:`lowerBoundsOnCoefficients`
+ """
+ return self.getOrDefault(self.lowerBoundsOnCoefficients)
+
+ @since("2.3.0")
+ def setUpperBoundsOnCoefficients(self, value):
+ """
+ Sets the value of :py:attr:`upperBoundsOnCoefficients`
+ """
+ return self._set(upperBoundsOnCoefficients=value)
+
+ @since("2.3.0")
+ def getUpperBoundsOnCoefficients(self):
+ """
+ Gets the value of :py:attr:`upperBoundsOnCoefficients`
+ """
+ return self.getOrDefault(self.upperBoundsOnCoefficients)
+
+ @since("2.3.0")
+ def setLowerBoundsOnIntercepts(self, value):
+ """
+ Sets the value of :py:attr:`lowerBoundsOnIntercepts`
+ """
+ return self._set(lowerBoundsOnIntercepts=value)
+
+ @since("2.3.0")
+ def getLowerBoundsOnIntercepts(self):
+ """
+ Gets the value of :py:attr:`lowerBoundsOnIntercepts`
+ """
+ return self.getOrDefault(self.lowerBoundsOnIntercepts)
+
+ @since("2.3.0")
+ def setUpperBoundsOnIntercepts(self, value):
+ """
+ Sets the value of :py:attr:`upperBoundsOnIntercepts`
+ """
+ return self._set(upperBoundsOnIntercepts=value)
+
+ @since("2.3.0")
+ def getUpperBoundsOnIntercepts(self):
+ """
+ Gets the value of :py:attr:`upperBoundsOnIntercepts`
+ """
+ return self.getOrDefault(self.upperBoundsOnIntercepts)
+
class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
"""
http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/param/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 99d8fa3..4583ae8 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -27,7 +27,7 @@ import numpy as np
from py4j.java_gateway import JavaObject
-from pyspark.ml.linalg import DenseVector, Vector
+from pyspark.ml.linalg import DenseVector, Vector, Matrix
from pyspark.ml.util import Identifiable
@@ -170,6 +170,15 @@ class TypeConverters(object):
raise TypeError("Could not convert %s to vector" % value)
@staticmethod
+ def toMatrix(value):
+ """
+ Convert a value to a MLlib Matrix, if possible.
+ """
+ if isinstance(value, Matrix):
+ return value
+ raise TypeError("Could not convert %s to matrix" % value)
+
+ @staticmethod
def toFloat(value):
"""
Convert a value to a float, if possible.
http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index a9ca346..7ee2c2f 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -1459,6 +1459,43 @@ class GeneralizedLinearRegressionTest(SparkSessionTestCase):
self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
+class LogisticRegressionTest(SparkSessionTestCase):
+
+ def test_binomial_logistic_regression_with_bound(self):
+
+ df = self.spark.createDataFrame(
+ [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
+ (0.0, 2.0, Vectors.dense(1.0, 2.0)),
+ (1.0, 3.0, Vectors.dense(2.0, 1.0)),
+ (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])
+
+ lor = LogisticRegression(regParam=0.01, weightCol="weight",
+ lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
+ upperBoundsOnIntercepts=Vectors.dense(0.0))
+ model = lor.fit(df)
+ self.assertTrue(
+ np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
+ self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
+
+ def test_multinomial_logistic_regression_with_bound(self):
+
+ data_path = "data/mllib/sample_multiclass_classification_data.txt"
+ df = self.spark.read.format("libsvm").load(data_path)
+
+ lor = LogisticRegression(regParam=0.01,
+ lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)),
+ upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0))
+ model = lor.fit(df)
+ expected = [[4.593, 4.5516, 9.0099, 12.2904],
+ [1.0, 8.1093, 7.0, 10.0],
+ [3.041, 5.0, 8.0, 11.0]]
+ for i in range(0, len(expected)):
+ self.assertTrue(
+ np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4))
+ self.assertTrue(
+ np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4))
+
+
class FPGrowthTests(SparkSessionTestCase):
def setUp(self):
super(FPGrowthTests, self).setUp()
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org