You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/06/23 07:40:26 UTC
spark git commit: [SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins
Repository: spark
Updated Branches:
refs/heads/master 44fa7df64 -> 164fe2aa4
[SPARK-7781] [MLLIB] gradient boosted trees.train regressor missing max bins
Author: Holden Karau <ho...@pigscanfly.ca>
Closes #6331 from holdenk/SPARK-7781-GradientBoostedTrees.trainRegressor-missing-max-bins and squashes the following commits:
2894695 [Holden Karau] remove extra blank line
2573e8d [Holden Karau] Update the scala side of the pythonmllibapi and make the test a bit nicer too
3a09170 [Holden Karau] add maxBins to the train method as well
af7f274 [Holden Karau] Add maxBins to GradientBoostedTrees.trainRegressor and correctly mention the default of 32 in other places where it mentioned 100
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/164fe2aa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/164fe2aa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/164fe2aa
Branch: refs/heads/master
Commit: 164fe2aa44993da6c77af6de5efdae47a8b3958c
Parents: 44fa7df
Author: Holden Karau <ho...@pigscanfly.ca>
Authored: Mon Jun 22 22:40:19 2015 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Mon Jun 22 22:40:19 2015 -0700
----------------------------------------------------------------------
.../spark/mllib/api/python/PythonMLLibAPI.scala | 4 +++-
python/pyspark/mllib/tests.py | 7 +++++++
python/pyspark/mllib/tree.py | 22 +++++++++++++-------
3 files changed, 24 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/164fe2aa/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 634d56d..f9a271f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -696,12 +696,14 @@ private[python] class PythonMLLibAPI extends Serializable {
lossStr: String,
numIterations: Int,
learningRate: Double,
- maxDepth: Int): GradientBoostedTreesModel = {
+ maxDepth: Int,
+ maxBins: Int): GradientBoostedTreesModel = {
val boostingStrategy = BoostingStrategy.defaultParams(algoStr)
boostingStrategy.setLoss(Losses.fromString(lossStr))
boostingStrategy.setNumIterations(numIterations)
boostingStrategy.setLearningRate(learningRate)
boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
+ boostingStrategy.treeStrategy.setMaxBins(maxBins)
boostingStrategy.treeStrategy.categoricalFeaturesInfo = categoricalFeaturesInfo.asScala.toMap
val cached = data.rdd.persist(StorageLevel.MEMORY_AND_DISK)
http://git-wip-us.apache.org/repos/asf/spark/blob/164fe2aa/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index b13159e..c8d61b9 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -463,6 +463,13 @@ class ListTests(MLlibTestCase):
except ValueError:
self.fail()
+ # Verify that maxBins is being passed through
+ GradientBoostedTrees.trainRegressor(
+ rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32)
+ with self.assertRaises(Exception) as cm:
+ GradientBoostedTrees.trainRegressor(
+ rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1)
+
class StatTests(MLlibTestCase):
# SPARK-4023
http://git-wip-us.apache.org/repos/asf/spark/blob/164fe2aa/python/pyspark/mllib/tree.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tree.py b/python/pyspark/mllib/tree.py
index cfcbea5..372b86a 100644
--- a/python/pyspark/mllib/tree.py
+++ b/python/pyspark/mllib/tree.py
@@ -299,7 +299,7 @@ class RandomForest(object):
1 internal node + 2 leaf nodes. (default: 4)
:param maxBins: maximum number of bins used for splitting
features
- (default: 100)
+ (default: 32)
:param seed: Random seed for bootstrapping and choosing feature
subsets.
:return: RandomForestModel that can be used for prediction
@@ -377,7 +377,7 @@ class RandomForest(object):
1 leaf node; depth 1 means 1 internal node + 2 leaf
nodes. (default: 4)
:param maxBins: maximum number of bins used for splitting
- features (default: 100)
+ features (default: 32)
:param seed: Random seed for bootstrapping and choosing feature
subsets.
:return: RandomForestModel that can be used for prediction
@@ -435,16 +435,17 @@ class GradientBoostedTrees(object):
@classmethod
def _train(cls, data, algo, categoricalFeaturesInfo,
- loss, numIterations, learningRate, maxDepth):
+ loss, numIterations, learningRate, maxDepth, maxBins):
first = data.first()
assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo,
- loss, numIterations, learningRate, maxDepth)
+ loss, numIterations, learningRate, maxDepth, maxBins)
return GradientBoostedTreesModel(model)
@classmethod
def trainClassifier(cls, data, categoricalFeaturesInfo,
- loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3):
+ loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3,
+ maxBins=32):
"""
Method to train a gradient-boosted trees model for
classification.
@@ -467,6 +468,8 @@ class GradientBoostedTrees(object):
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
1 leaf node; depth 1 means 1 internal node + 2 leaf
nodes. (default: 3)
+ :param maxBins: maximum number of bins used for splitting
+ features (default: 32) DecisionTree requires maxBins >= max categories
:return: GradientBoostedTreesModel that can be used for
prediction
@@ -499,11 +502,12 @@ class GradientBoostedTrees(object):
[1.0, 0.0]
"""
return cls._train(data, "classification", categoricalFeaturesInfo,
- loss, numIterations, learningRate, maxDepth)
+ loss, numIterations, learningRate, maxDepth, maxBins)
@classmethod
def trainRegressor(cls, data, categoricalFeaturesInfo,
- loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3):
+ loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
+ maxBins=32):
"""
Method to train a gradient-boosted trees model for regression.
@@ -522,6 +526,8 @@ class GradientBoostedTrees(object):
contribution of each estimator. The learning rate
should be between in the interval (0, 1].
(default: 0.1)
+ :param maxBins: maximum number of bins used for splitting
+ features (default: 32) DecisionTree requires maxBins >= max categories
:param maxDepth: Maximum depth of the tree. E.g., depth 0 means
1 leaf node; depth 1 means 1 internal node + 2 leaf
nodes. (default: 3)
@@ -556,7 +562,7 @@ class GradientBoostedTrees(object):
[1.0, 0.0]
"""
return cls._train(data, "regression", categoricalFeaturesInfo,
- loss, numIterations, learningRate, maxDepth)
+ loss, numIterations, learningRate, maxDepth, maxBins)
def _test():
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org