You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2016/03/31 22:00:16 UTC
spark git commit: [SPARK-14264][PYSPARK][ML] Add feature importance
for GBTs in pyspark
Repository: spark
Updated Branches:
refs/heads/master e78540282 -> b11887c08
[SPARK-14264][PYSPARK][ML] Add feature importance for GBTs in pyspark
## What changes were proposed in this pull request?
Feature importances are exposed in the python API for GBTs.
Other changes:
* Update the random forest feature importance documentation to not repeat decision tree docstring and instead place a reference to it.
## How was this patch tested?
Python doc tests were updated to validate GBT feature importance.
Author: sethah <se...@gmail.com>
Closes #12056 from sethah/Pyspark_GBT_feature_importance.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b11887c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b11887c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b11887c0
Branch: refs/heads/master
Commit: b11887c086974dbab18b9f53e99a26bbe06e9c86
Parents: e785402
Author: sethah <se...@gmail.com>
Authored: Thu Mar 31 13:00:10 2016 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Thu Mar 31 13:00:10 2016 -0700
----------------------------------------------------------------------
python/pyspark/ml/classification.py | 33 ++++++++++++++++++++++----------
python/pyspark/ml/regression.py | 33 ++++++++++++++++++++++----------
2 files changed, 46 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/b11887c0/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 07cafa0..f5335a3 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -396,7 +396,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel, JavaMLWritable, JavaMLR
- Normalize importances for tree to sum to 1.
Note: Feature importance for single decision trees can have high variance due to
- correlated predictor variables. Consider using a :class:`RandomForestClassifier`
+ correlated predictor variables. Consider using a :py:class:`RandomForestClassifier`
to determine feature importance instead.
"""
return self._call_java("featureImportances")
@@ -500,16 +500,12 @@ class RandomForestClassificationModel(TreeEnsembleModels):
"""
Estimate of the importance of each feature.
- This generalizes the idea of "Gini" importance to other losses,
- following the explanation of Gini importance from "Random Forests" documentation
- by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+ Each feature's importance is the average of its importance across all trees in the ensemble.
+ The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
+ (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
+ and follows the implementation from scikit-learn.
- This feature importance is calculated as follows:
- - Average over trees:
- - importance(feature j) = sum (over nodes which split on feature j) of the gain,
- where gain is scaled by the number of instances passing through node
- - Normalize importances for tree to sum to 1.
- - Normalize feature importance vector to sum to 1.
+ .. seealso:: :py:attr:`DecisionTreeClassificationModel.featureImportances`
"""
return self._call_java("featureImportances")
@@ -534,6 +530,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
>>> td = si_model.transform(df)
>>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
>>> model = gbt.fit(td)
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -613,6 +611,21 @@ class GBTClassificationModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ Each feature's importance is the average of its importance across all trees in the ensemble.
+ The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
+ (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
+ and follows the implementation from scikit-learn.
+
+ .. seealso:: :py:attr:`DecisionTreeClassificationModel.featureImportances`
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol,
http://git-wip-us.apache.org/repos/asf/spark/blob/b11887c0/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 3764854..de8a5e4 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -533,7 +533,7 @@ class DecisionTreeRegressionModel(DecisionTreeModel, JavaMLWritable, JavaMLReada
- Normalize importances for tree to sum to 1.
Note: Feature importance for single decision trees can have high variance due to
- correlated predictor variables. Consider using a :class:`RandomForestRegressor`
+ correlated predictor variables. Consider using a :py:class:`RandomForestRegressor`
to determine feature importance instead.
"""
return self._call_java("featureImportances")
@@ -626,16 +626,12 @@ class RandomForestRegressionModel(TreeEnsembleModels):
"""
Estimate of the importance of each feature.
- This generalizes the idea of "Gini" importance to other losses,
- following the explanation of Gini importance from "Random Forests" documentation
- by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn.
+ Each feature's importance is the average of its importance across all trees in the ensemble.
+ The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
+ (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
+ and follows the implementation from scikit-learn.
- This feature importance is calculated as follows:
- - Average over trees:
- - importance(feature j) = sum (over nodes which split on feature j) of the gain,
- where gain is scaled by the number of instances passing through node
- - Normalize importances for tree to sum to 1.
- - Normalize feature importance vector to sum to 1.
+ .. seealso:: :py:attr:`DecisionTreeRegressionModel.featureImportances`
"""
return self._call_java("featureImportances")
@@ -655,6 +651,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
... (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
>>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
>>> model = gbt.fit(df)
+ >>> model.featureImportances
+ SparseVector(1, {0: 1.0})
>>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1])
True
>>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
@@ -734,6 +732,21 @@ class GBTRegressionModel(TreeEnsembleModels):
.. versionadded:: 1.4.0
"""
+ @property
+ @since("2.0.0")
+ def featureImportances(self):
+ """
+ Estimate of the importance of each feature.
+
+ Each feature's importance is the average of its importance across all trees in the ensemble.
+ The importance vector is normalized to sum to 1. This method is suggested by Hastie et al.
+ (Hastie, Tibshirani, Friedman. "The Elements of Statistical Learning, 2nd Edition." 2001.)
+ and follows the implementation from scikit-learn.
+
+ .. seealso:: :py:attr:`DecisionTreeRegressionModel.featureImportances`
+ """
+ return self._call_java("featureImportances")
+
@inherit_doc
class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org