Posted to commits@spark.apache.org by jk...@apache.org on 2016/01/27 00:53:56 UTC

spark git commit: [SPARK-10509][PYSPARK] Reduce excessive param boiler plate code

Repository: spark
Updated Branches:
  refs/heads/master 19fdb21af -> eb917291c


[SPARK-10509][PYSPARK] Reduce excessive param boiler plate code

The current Python ML params require cut-and-pasting the param setup and description between the class and ```__init__``` methods. This change removes that source of errors and simplifies the use of custom params by adding a ```_copy_new_parent``` method to ```Param```, so the same definition no longer has to be duplicated (and duplicated at different indentation levels, urgh).
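
For readers skimming the diff below, here is a minimal sketch of the pattern this enables (the ```MyBinarizerParams``` class is hypothetical, not part of the patch): the ```Param``` is declared once at class level with ```Params._dummy()``` as a placeholder parent, and ```Params.__init__``` now copies it onto each new instance, so ```__init__``` no longer repeats the name and doc string.

```python
from pyspark.ml.param import Param, Params


class MyBinarizerParams(Params):
    """Hypothetical params mixin written against the post-patch API."""

    # Declared once at class level; no duplicate Param(...) inside __init__.
    threshold = Param(Params._dummy(), "threshold",
                      "threshold in binary classification prediction, in range [0, 1]")

    def __init__(self):
        super(MyBinarizerParams, self).__init__()
        # Params.__init__ calls _copy_params(), which re-binds the class-level
        # Param to this instance, so only the default needs to be set here.
        self._setDefault(threshold=0.0)
```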

Author: Holden Karau <ho...@us.ibm.com>

Closes #10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eb917291
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eb917291
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eb917291

Branch: refs/heads/master
Commit: eb917291ca1a2d68ca0639cb4b1464a546603eba
Parents: 19fdb21
Author: Holden Karau <ho...@us.ibm.com>
Authored: Tue Jan 26 15:53:48 2016 -0800
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Tue Jan 26 15:53:48 2016 -0800

----------------------------------------------------------------------
 python/pyspark/ml/classification.py             | 32 -------
 python/pyspark/ml/clustering.py                 |  7 --
 python/pyspark/ml/evaluation.py                 | 12 ---
 python/pyspark/ml/feature.py                    | 98 +-------------------
 python/pyspark/ml/param/__init__.py             | 22 +++++
 .../pyspark/ml/param/_shared_params_code_gen.py | 17 +---
 python/pyspark/ml/param/shared.py               | 81 +---------------
 python/pyspark/ml/pipeline.py                   |  4 +-
 python/pyspark/ml/recommendation.py             | 11 ---
 python/pyspark/ml/regression.py                 | 46 ---------
 python/pyspark/ml/tests.py                      | 12 +++
 python/pyspark/ml/tuning.py                     | 18 ----
 12 files changed, 43 insertions(+), 317 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 265c6a1..3179fb3 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -72,7 +72,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     .. versionadded:: 1.3.0
     """
 
-    # a placeholder to make it appear in the generated doc
     threshold = Param(Params._dummy(), "threshold",
                       "Threshold in binary classification prediction, in range [0, 1]." +
                       " If threshold and thresholds are both set, they must match.")
@@ -92,10 +91,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
         super(LogisticRegression, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.LogisticRegression", self.uid)
-        #: param for threshold in binary classification, in range [0, 1].
-        self.threshold = Param(self, "threshold",
-                               "Threshold in binary classification prediction, in range [0, 1]." +
-                               " If threshold and thresholds are both set, they must match.")
         self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -232,7 +227,6 @@ class TreeClassifierParams(object):
     """
     supportedImpurities = ["entropy", "gini"]
 
-    # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
                      "Supported options: " +
@@ -240,10 +234,6 @@ class TreeClassifierParams(object):
 
     def __init__(self):
         super(TreeClassifierParams, self).__init__()
-        #: param for Criterion used for information gain calculation (case-insensitive).
-        self.impurity = Param(self, "impurity", "Criterion used for information " +
-                              "gain calculation (case-insensitive). Supported options: " +
-                              ", ".join(self.supportedImpurities))
 
     @since("1.6.0")
     def setImpurity(self, value):
@@ -485,7 +475,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
                      "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -504,10 +493,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
         super(GBTClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.GBTClassifier", self.uid)
-        #: param for Loss function which GBT tries to minimize (case-insensitive).
-        self.lossType = Param(self, "lossType",
-                              "Loss function which GBT tries to minimize (case-insensitive). " +
-                              "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
                          lossType="logistic", maxIter=20, stepSize=0.1)
@@ -597,7 +582,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
                       "default is 1.0")
     modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
@@ -615,13 +599,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
         super(NaiveBayes, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.NaiveBayes", self.uid)
-        #: param for the smoothing parameter.
-        self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
-                               "default is 1.0")
-        #: param for the model type.
-        self.modelType = Param(self, "modelType", "The model type which is a string " +
-                               "(case-sensitive). Supported options: multinomial (default) " +
-                               "and bernoulli.")
         self._setDefault(smoothing=1.0, modelType="multinomial")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -734,7 +711,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                    "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
                    "neurons and output layer of 10 neurons, default is [1, 1].")
@@ -753,14 +729,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
-                            "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
-                            "100 neurons and output layer of 10 neurons, default is [1, 1].")
-        self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
-                               "matrices. Data is stacked within partitions. If block size is " +
-                               "more than remaining data in a partition then it is adjusted to " +
-                               "the size of this data. Recommended size is between 10 and 1000, " +
-                               "default is 128.")
         self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 9189c02..60d1c9a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -73,7 +73,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     k = Param(Params._dummy(), "k", "number of clusters to create")
     initMode = Param(Params._dummy(), "initMode",
                      "the initialization algorithm. This can be either \"random\" to " +
@@ -90,12 +89,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
         """
         super(KMeans, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
-        self.k = Param(self, "k", "number of clusters to create")
-        self.initMode = Param(self, "initMode",
-                              "the initialization algorithm. This can be either \"random\" to " +
-                              "choose random points as initial cluster centers, or \"k-means||\" " +
-                              "to use a parallel variant of k-means++")
-        self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode")
         self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/evaluation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 6ff68ab..c9b95b3 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -124,7 +124,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation (areaUnderROC|areaUnderPR)")
 
@@ -138,9 +137,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
         super(BinaryClassificationEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
-        #: param for metric name in evaluation (areaUnderROC|areaUnderPR)
-        self.metricName = Param(self, "metricName",
-                                "metric name in evaluation (areaUnderROC|areaUnderPR)")
         self._setDefault(rawPredictionCol="rawPrediction", labelCol="label",
                          metricName="areaUnderROC")
         kwargs = self.__init__._input_kwargs
@@ -210,9 +206,6 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
         super(RegressionEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
-        #: param for metric name in evaluation (mse|rmse|r2|mae)
-        self.metricName = Param(self, "metricName",
-                                "metric name in evaluation (mse|rmse|r2|mae)")
         self._setDefault(predictionCol="prediction", labelCol="label",
                          metricName="rmse")
         kwargs = self.__init__._input_kwargs
@@ -265,7 +258,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
 
     .. versionadded:: 1.5.0
     """
-    # a placeholder to make it appear in the generated doc
     metricName = Param(Params._dummy(), "metricName",
                        "metric name in evaluation "
                        "(f1|precision|recall|weightedPrecision|weightedRecall)")
@@ -280,10 +272,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
         super(MulticlassClassificationEvaluator, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
-        # param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall)
-        self.metricName = Param(self, "metricName",
-                                "metric name in evaluation"
-                                " (f1|precision|recall|weightedPrecision|weightedRecall)")
         self._setDefault(predictionCol="prediction", labelCol="label",
                          metricName="f1")
         kwargs = self.__init__._input_kwargs

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 32f3246..2208123 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -57,7 +57,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     threshold = Param(Params._dummy(), "threshold",
                       "threshold in binary classification prediction, in range [0, 1]")
 
@@ -68,8 +67,6 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(Binarizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Binarizer", self.uid)
-        self.threshold = Param(self, "threshold",
-                               "threshold in binary classification prediction, in range [0, 1]")
         self._setDefault(threshold=0.0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -125,7 +122,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.3.0
     """
 
-    # a placeholder to make it appear in the generated doc
     splits = \
         Param(Params._dummy(), "splits",
               "Split points for mapping continuous features into buckets. With n+1 splits, " +
@@ -142,19 +138,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(Bucketizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
-        #: param for Splitting points for mapping continuous features into buckets. With n+1 splits,
-        #  there are n buckets. A bucket defined by splits x,y holds values in the range [x,y)
-        #  except the last bucket, which also includes y. The splits should be strictly increasing.
-        #  Values at -inf, inf must be explicitly provided to cover all Double values; otherwise,
-        #  values outside the splits specified will be treated as errors.
-        self.splits = \
-            Param(self, "splits",
-                  "Split points for mapping continuous features into buckets. With n+1 splits, " +
-                  "there are n buckets. A bucket defined by splits x,y holds values in the " +
-                  "range [x,y) except the last bucket, which also includes y. The splits " +
-                  "should be strictly increasing. Values at -inf, inf must be explicitly " +
-                  "provided to cover all Double values; otherwise, values outside the splits " +
-                  "specified will be treated as errors.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -210,7 +193,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     minTF = Param(
         Params._dummy(), "minTF", "Filter to ignore rare words in" +
         " a document. For each document, terms with frequency/count less than the given" +
@@ -235,22 +217,6 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol):
         super(CountVectorizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
                                             self.uid)
-        self.minTF = Param(
-            self, "minTF", "Filter to ignore rare words in" +
-            " a document. For each document, terms with frequency/count less than the given" +
-            " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
-            " times the term must appear in the document); if this is a double in [0,1), then " +
-            "this specifies a fraction (out of the document's token count). Note that the " +
-            "parameter is only used in transform of CountVectorizerModel and does not affect" +
-            "fitting. Default 1.0")
-        self.minDF = Param(
-            self, "minDF", "Specifies the minimum number of" +
-            " different documents a term must appear in to be included in the vocabulary." +
-            " If this is an integer >= 1, this specifies the number of documents the term must" +
-            " appear in; if this is a double in [0,1), then this specifies the fraction of " +
-            "documents. Default 1.0")
-        self.vocabSize = Param(
-            self, "vocabSize", "max size of the vocabulary. Default 1 << 18.")
         self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -359,7 +325,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
                     "default False.")
 
@@ -370,8 +335,6 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(DCT, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.DCT", self.uid)
-        self.inverse = Param(self, "inverse", "Set transformer to perform inverse DCT, " +
-                             "default False.")
         self._setDefault(inverse=False)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -423,7 +386,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
                        "it must be MLlib Vector type.")
 
@@ -435,8 +397,6 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
         super(ElementwiseProduct, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ElementwiseProduct",
                                             self.uid)
-        self.scalingVec = Param(self, "scalingVec", "vector for hadamard product, " +
-                                "it must be MLlib Vector type.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -531,7 +491,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     minDocFreq = Param(Params._dummy(), "minDocFreq",
                        "minimum of documents in which a term should appear for filtering")
 
@@ -542,8 +501,6 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol):
         """
         super(IDF, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IDF", self.uid)
-        self.minDocFreq = Param(self, "minDocFreq",
-                                "minimum of documents in which a term should appear for filtering")
         self._setDefault(minDocFreq=0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -623,7 +580,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
     max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
 
@@ -634,8 +590,6 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol):
         """
         super(MinMaxScaler, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.MinMaxScaler", self.uid)
-        self.min = Param(self, "min", "Lower bound of the output feature range")
-        self.max = Param(self, "max", "Upper bound of the output feature range")
         self._setDefault(min=0.0, max=1.0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -745,7 +699,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
 
     @keyword_only
@@ -755,7 +708,6 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(NGram, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.NGram", self.uid)
-        self.n = Param(self, "n", "number of elements per n-gram (>=1)")
         self._setDefault(n=2)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -808,7 +760,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     p = Param(Params._dummy(), "p", "the p norm value.")
 
     @keyword_only
@@ -818,7 +769,6 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(Normalizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Normalizer", self.uid)
-        self.p = Param(self, "p", "the p norm value.")
         self._setDefault(p=2.0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -887,7 +837,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")
 
     @keyword_only
@@ -897,7 +846,6 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(OneHotEncoder, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.OneHotEncoder", self.uid)
-        self.dropLast = Param(self, "dropLast", "whether to drop the last category")
         self._setDefault(dropLast=True)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -950,7 +898,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")
 
     @keyword_only
@@ -961,7 +908,6 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol):
         super(PolynomialExpansion, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.feature.PolynomialExpansion", self.uid)
-        self.degree = Param(self, "degree", "the polynomial degree to expand (>= 1)")
         self._setDefault(degree=2)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -1107,7 +1053,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
     gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
     pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
@@ -1123,11 +1068,6 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
-        self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
-        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
-        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
-        self.toLowercase = Param(self, "toLowercase", "whether to convert all characters to " +
-                                 "lowercase before tokenizing")
         self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+", toLowercase=True)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -1223,7 +1163,6 @@ class SQLTransformer(JavaTransformer):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     statement = Param(Params._dummy(), "statement", "SQL statement")
 
     @keyword_only
@@ -1233,7 +1172,6 @@ class SQLTransformer(JavaTransformer):
         """
         super(SQLTransformer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
-        self.statement = Param(self, "statement", "SQL statement")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -1285,7 +1223,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     withMean = Param(Params._dummy(), "withMean", "Center data with mean")
     withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
 
@@ -1296,8 +1233,6 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
         """
         super(StandardScaler, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StandardScaler", self.uid)
-        self.withMean = Param(self, "withMean", "Center data with mean")
-        self.withStd = Param(self, "withStd", "Scale to unit standard deviation")
         self._setDefault(withMean=False, withStd=True)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -1453,7 +1388,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make the labels show up in generated doc
     labels = Param(Params._dummy(), "labels",
                    "Optional array of labels specifying index-string mapping." +
                    " If not provided or if empty, then metadata from inputCol is used instead.")
@@ -1466,9 +1400,6 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol):
         super(IndexToString, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.IndexToString",
                                             self.uid)
-        self.labels = Param(self, "labels",
-                            "Optional array of labels specifying index-string mapping. If not" +
-                            " provided or if empty, then metadata from inputCol is used instead.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -1507,7 +1438,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
 
     .. versionadded:: 1.6.0
     """
-    # a placeholder to make the stopwords show up in generated doc
+
     stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out")
     caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
                           "comparison over the stop words")
@@ -1522,9 +1453,6 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol):
         super(StopWordsRemover, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover",
                                             self.uid)
-        self.stopWords = Param(self, "stopWords", "The words to be filtered out")
-        self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " +
-                                   "sensitive comparison over the stop words")
         stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
         defaultStopWords = stopWordsObj.English()
         self._setDefault(stopWords=defaultStopWords)
@@ -1727,7 +1655,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     maxCategories = Param(Params._dummy(), "maxCategories",
                           "Threshold for the number of values a categorical feature can take " +
                           "(>= 2). If a feature is found to have > maxCategories values, then " +
@@ -1740,10 +1667,6 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
         """
         super(VectorIndexer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorIndexer", self.uid)
-        self.maxCategories = Param(self, "maxCategories",
-                                   "Threshold for the number of values a categorical feature " +
-                                   "can take (>= 2). If a feature is found to have " +
-                                   "> maxCategories values, then it is declared continuous.")
         self._setDefault(maxCategories=20)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -1832,7 +1755,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
                     "a vector column. There can be no overlap with names.")
     names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
@@ -1847,12 +1769,6 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
         """
         super(VectorSlicer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.VectorSlicer", self.uid)
-        self.indices = Param(self, "indices", "An array of indices to select features from " +
-                             "a vector column. There can be no overlap with names.")
-        self.names = Param(self, "names", "An array of feature names to select features from " +
-                           "a vector column. These names must be specified by ML " +
-                           "org.apache.spark.ml.attribute.Attribute. There can be no overlap " +
-                           "with indices.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -1932,7 +1848,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     vectorSize = Param(Params._dummy(), "vectorSize",
                        "the dimension of codes after transforming from words")
     numPartitions = Param(Params._dummy(), "numPartitions",
@@ -1950,13 +1865,6 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
         """
         super(Word2Vec, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid)
-        self.vectorSize = Param(self, "vectorSize",
-                                "the dimension of codes after transforming from words")
-        self.numPartitions = Param(self, "numPartitions",
-                                   "number of partitions for sentences of words")
-        self.minCount = Param(self, "minCount",
-                              "the minimum number of times a token must appear to be included " +
-                              "in the word2vec model's vocabulary")
         self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
                          seed=None)
         kwargs = self.__init__._input_kwargs
@@ -2075,7 +1983,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     k = Param(Params._dummy(), "k", "the number of principal components")
 
     @keyword_only
@@ -2085,7 +1992,6 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol):
         """
         super(PCA, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.PCA", self.uid)
-        self.k = Param(self, "k", "the number of principal components")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
@@ -2185,7 +2091,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
     .. versionadded:: 1.5.0
     """
 
-    # a placeholder to make it appear in the generated doc
     formula = Param(Params._dummy(), "formula", "R model formula")
 
     @keyword_only
@@ -2195,7 +2100,6 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol):
         """
         super(RFormula, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RFormula", self.uid)
-        self.formula = Param(self, "formula", "R model formula")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/param/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 92ce96a..3da36d3 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -40,6 +40,15 @@ class Param(object):
         self.doc = str(doc)
         self.expectedType = expectedType
 
+    def _copy_new_parent(self, parent):
+        """Copy the current param to a new parent, must be a dummy param."""
+        if self.parent == "undefined":
+            param = copy.copy(self)
+            param.parent = parent.uid
+            return param
+        else:
+            raise ValueError("Cannot copy from non-dummy parent %s." % parent)
+
     def __str__(self):
         return str(self.parent) + "__" + self.name
 
@@ -77,6 +86,19 @@ class Params(Identifiable):
         #: value returned by :py:func:`params`
         self._params = None
 
+        # Copy the params from the class to the object
+        self._copy_params()
+
+    def _copy_params(self):
+        """
+        Copy all params defined on the class to current object.
+        """
+        cls = type(self)
+        src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)]
+        src_params = list(filter(lambda nameAttr: isinstance(nameAttr[1], Param), src_name_attrs))
+        for name, param in src_params:
+            setattr(self, name, param._copy_new_parent(self))
+
     @property
     @since("1.3.0")
     def params(self):

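A quick way to observe the new ```_copy_params``` / ```_copy_new_parent``` behavior added above, using a throwaway pure-Python params class (illustrative only; ```HasAlpha``` is not part of the patch and no SparkContext is required):

```python
from pyspark.ml.param import Param, Params


class HasAlpha(Params):
    """Throwaway mixin used only to illustrate the re-parenting."""
    alpha = Param(Params._dummy(), "alpha", "a toy smoothing parameter")


h = HasAlpha()
# The class-level Param stays owned by the dummy placeholder...
print(HasAlpha.alpha.parent)     # 'undefined'
# ...while the instance attribute is a copy re-parented to this object's uid.
print(h.alpha.parent == h.uid)   # True
```
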
http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 82855bc..5e297b8 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -50,13 +50,11 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType):
     Mixin for param $name: $doc
     """
 
-    # a placeholder to make it appear in the generated doc
     $name = Param(Params._dummy(), "$name", "$doc", $expectedType)
 
     def __init__(self):
-        super(Has$Name, self).__init__()
-        #: param for $doc
-        self.$name = Param(self, "$name", "$doc", $expectedType)'''
+        super(Has$Name, self).__init__()'''
+
     if defaultValueStr is not None:
         template += '''
         self._setDefault($name=$defaultValueStr)'''
@@ -171,22 +169,17 @@ if __name__ == "__main__":
     Mixin for Decision Tree parameters.
     """
 
-    # a placeholder to make it appear in the generated doc
     $dummyPlaceHolders
 
     def __init__(self):
-        super(DecisionTreeParams, self).__init__()
-        $realParams'''
+        super(DecisionTreeParams, self).__init__()'''
     dtParamMethods = ""
     dummyPlaceholders = ""
-    realParams = ""
     paramTemplate = """$name = Param($owner, "$name", "$doc")"""
     for name, doc in decisionTreeParams:
         variable = paramTemplate.replace("$name", name).replace("$doc", doc)
         dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n    "
-        realParams += "#: param for " + doc + "\n        "
-        realParams += "self." + variable.replace("$owner", "self") + "\n        "
         dtParamMethods += _gen_param_code(name, doc, None) + "\n"
-    code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders)
-                .replace("$realParams", realParams) + dtParamMethods)
+    code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" +
+                dtParamMethods)
     print("\n\n\n".join(code))

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 23f9431..db4a8a5 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -25,13 +25,10 @@ class HasMaxIter(Params):
     Mixin for param maxIter: max number of iterations (>= 0).
     """
 
-    # a placeholder to make it appear in the generated doc
     maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", int)
 
     def __init__(self):
         super(HasMaxIter, self).__init__()
-        #: param for max number of iterations (>= 0).
-        self.maxIter = Param(self, "maxIter", "max number of iterations (>= 0).", int)
 
     def setMaxIter(self, value):
         """
@@ -52,13 +49,10 @@ class HasRegParam(Params):
     Mixin for param regParam: regularization parameter (>= 0).
     """
 
-    # a placeholder to make it appear in the generated doc
     regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", float)
 
     def __init__(self):
         super(HasRegParam, self).__init__()
-        #: param for regularization parameter (>= 0).
-        self.regParam = Param(self, "regParam", "regularization parameter (>= 0).", float)
 
     def setRegParam(self, value):
         """
@@ -79,13 +73,10 @@ class HasFeaturesCol(Params):
     Mixin for param featuresCol: features column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", str)
 
     def __init__(self):
         super(HasFeaturesCol, self).__init__()
-        #: param for features column name.
-        self.featuresCol = Param(self, "featuresCol", "features column name.", str)
         self._setDefault(featuresCol='features')
 
     def setFeaturesCol(self, value):
@@ -107,13 +98,10 @@ class HasLabelCol(Params):
     Mixin for param labelCol: label column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     labelCol = Param(Params._dummy(), "labelCol", "label column name.", str)
 
     def __init__(self):
         super(HasLabelCol, self).__init__()
-        #: param for label column name.
-        self.labelCol = Param(self, "labelCol", "label column name.", str)
         self._setDefault(labelCol='label')
 
     def setLabelCol(self, value):
@@ -135,13 +123,10 @@ class HasPredictionCol(Params):
     Mixin for param predictionCol: prediction column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", str)
 
     def __init__(self):
         super(HasPredictionCol, self).__init__()
-        #: param for prediction column name.
-        self.predictionCol = Param(self, "predictionCol", "prediction column name.", str)
         self._setDefault(predictionCol='prediction')
 
     def setPredictionCol(self, value):
@@ -163,13 +148,10 @@ class HasProbabilityCol(Params):
     Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
     """
 
-    # a placeholder to make it appear in the generated doc
     probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str)
 
     def __init__(self):
         super(HasProbabilityCol, self).__init__()
-        #: param for Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
-        self.probabilityCol = Param(self, "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str)
         self._setDefault(probabilityCol='probability')
 
     def setProbabilityCol(self, value):
@@ -191,13 +173,10 @@ class HasRawPredictionCol(Params):
     Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str)
 
     def __init__(self):
         super(HasRawPredictionCol, self).__init__()
-        #: param for raw prediction (a.k.a. confidence) column name.
-        self.rawPredictionCol = Param(self, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str)
         self._setDefault(rawPredictionCol='rawPrediction')
 
     def setRawPredictionCol(self, value):
@@ -219,13 +198,10 @@ class HasInputCol(Params):
     Mixin for param inputCol: input column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     inputCol = Param(Params._dummy(), "inputCol", "input column name.", str)
 
     def __init__(self):
         super(HasInputCol, self).__init__()
-        #: param for input column name.
-        self.inputCol = Param(self, "inputCol", "input column name.", str)
 
     def setInputCol(self, value):
         """
@@ -246,13 +222,10 @@ class HasInputCols(Params):
     Mixin for param inputCols: input column names.
     """
 
-    # a placeholder to make it appear in the generated doc
     inputCols = Param(Params._dummy(), "inputCols", "input column names.", None)
 
     def __init__(self):
         super(HasInputCols, self).__init__()
-        #: param for input column names.
-        self.inputCols = Param(self, "inputCols", "input column names.", None)
 
     def setInputCols(self, value):
         """
@@ -273,13 +246,10 @@ class HasOutputCol(Params):
     Mixin for param outputCol: output column name.
     """
 
-    # a placeholder to make it appear in the generated doc
     outputCol = Param(Params._dummy(), "outputCol", "output column name.", str)
 
     def __init__(self):
         super(HasOutputCol, self).__init__()
-        #: param for output column name.
-        self.outputCol = Param(self, "outputCol", "output column name.", str)
         self._setDefault(outputCol=self.uid + '__output')
 
     def setOutputCol(self, value):
@@ -301,13 +271,10 @@ class HasNumFeatures(Params):
     Mixin for param numFeatures: number of features.
     """
 
-    # a placeholder to make it appear in the generated doc
     numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", int)
 
     def __init__(self):
         super(HasNumFeatures, self).__init__()
-        #: param for number of features.
-        self.numFeatures = Param(self, "numFeatures", "number of features.", int)
 
     def setNumFeatures(self, value):
         """
@@ -328,13 +295,10 @@ class HasCheckpointInterval(Params):
     Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
     """
 
-    # a placeholder to make it appear in the generated doc
     checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int)
 
     def __init__(self):
         super(HasCheckpointInterval, self).__init__()
-        #: param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
-        self.checkpointInterval = Param(self, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int)
 
     def setCheckpointInterval(self, value):
         """
@@ -355,13 +319,10 @@ class HasSeed(Params):
     Mixin for param seed: random seed.
     """
 
-    # a placeholder to make it appear in the generated doc
     seed = Param(Params._dummy(), "seed", "random seed.", int)
 
     def __init__(self):
         super(HasSeed, self).__init__()
-        #: param for random seed.
-        self.seed = Param(self, "seed", "random seed.", int)
         self._setDefault(seed=hash(type(self).__name__))
 
     def setSeed(self, value):
@@ -383,13 +344,10 @@ class HasTol(Params):
     Mixin for param tol: the convergence tolerance for iterative algorithms.
     """
 
-    # a placeholder to make it appear in the generated doc
     tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", float)
 
     def __init__(self):
         super(HasTol, self).__init__()
-        #: param for the convergence tolerance for iterative algorithms.
-        self.tol = Param(self, "tol", "the convergence tolerance for iterative algorithms.", float)
 
     def setTol(self, value):
         """
@@ -410,13 +368,10 @@ class HasStepSize(Params):
     Mixin for param stepSize: Step size to be used for each iteration of optimization.
     """
 
-    # a placeholder to make it appear in the generated doc
     stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", float)
 
     def __init__(self):
         super(HasStepSize, self).__init__()
-        #: param for Step size to be used for each iteration of optimization.
-        self.stepSize = Param(self, "stepSize", "Step size to be used for each iteration of optimization.", float)
 
     def setStepSize(self, value):
         """
@@ -437,13 +392,10 @@ class HasHandleInvalid(Params):
     Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.
     """
 
-    # a placeholder to make it appear in the generated doc
     handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str)
 
     def __init__(self):
         super(HasHandleInvalid, self).__init__()
-        #: param for how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.
-        self.handleInvalid = Param(self, "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str)
 
     def setHandleInvalid(self, value):
         """
@@ -464,13 +416,10 @@ class HasElasticNetParam(Params):
     Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
     """
 
-    # a placeholder to make it appear in the generated doc
     elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float)
 
     def __init__(self):
         super(HasElasticNetParam, self).__init__()
-        #: param for the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
-        self.elasticNetParam = Param(self, "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float)
         self._setDefault(elasticNetParam=0.0)
 
     def setElasticNetParam(self, value):
@@ -492,13 +441,10 @@ class HasFitIntercept(Params):
     Mixin for param fitIntercept: whether to fit an intercept term.
     """
 
-    # a placeholder to make it appear in the generated doc
     fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", bool)
 
     def __init__(self):
         super(HasFitIntercept, self).__init__()
-        #: param for whether to fit an intercept term.
-        self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.", bool)
         self._setDefault(fitIntercept=True)
 
     def setFitIntercept(self, value):
@@ -520,13 +466,10 @@ class HasStandardization(Params):
     Mixin for param standardization: whether to standardize the training features before fitting the model.
     """
 
-    # a placeholder to make it appear in the generated doc
     standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", bool)
 
     def __init__(self):
         super(HasStandardization, self).__init__()
-        #: param for whether to standardize the training features before fitting the model.
-        self.standardization = Param(self, "standardization", "whether to standardize the training features before fitting the model.", bool)
         self._setDefault(standardization=True)
 
     def setStandardization(self, value):
@@ -548,13 +491,10 @@ class HasThresholds(Params):
     Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
     """
 
-    # a placeholder to make it appear in the generated doc
     thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None)
 
     def __init__(self):
         super(HasThresholds, self).__init__()
-        #: param for Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
-        self.thresholds = Param(self, "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None)
 
     def setThresholds(self, value):
         """
@@ -575,13 +515,10 @@ class HasWeightCol(Params):
     Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0.
     """
 
-    # a placeholder to make it appear in the generated doc
     weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str)
 
     def __init__(self):
         super(HasWeightCol, self).__init__()
-        #: param for weight column name. If this is not set or empty, we treat all instance weights as 1.0.
-        self.weightCol = Param(self, "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str)
 
     def setWeightCol(self, value):
         """
@@ -602,13 +539,10 @@ class HasSolver(Params):
     Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.
     """
 
-    # a placeholder to make it appear in the generated doc
     solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str)
 
     def __init__(self):
         super(HasSolver, self).__init__()
-        #: param for the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.
-        self.solver = Param(self, "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str)
         self._setDefault(solver='auto')
 
     def setSolver(self, value):
@@ -630,7 +564,6 @@ class DecisionTreeParams(Params):
     Mixin for Decision Tree parameters.
     """
 
-    # a placeholder to make it appear in the generated doc
     maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
     maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.")
     minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
@@ -641,19 +574,7 @@ class DecisionTreeParams(Params):
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
-        #: param for Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.
-        self.maxDepth = Param(self, "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
-        #: param for Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.
-        self.maxBins = Param(self, "maxBins", "Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.")
-        #: param for Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.
-        self.minInstancesPerNode = Param(self, "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
-        #: param for Minimum information gain for a split to be considered at a tree node.
-        self.minInfoGain = Param(self, "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
-        #: param for Maximum memory in MB allocated to histogram aggregation.
-        self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
-        #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.
-        self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
-        
+
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.

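The shared.py hunks above all converge on the same shape: each Param is declared once at class level against Params._dummy(), and __init__ keeps only the defaults. As a minimal sketch of a custom mixin under this pattern (the HasFanout name, its doc string, and its default are illustrative only, not part of this patch):

    from pyspark.ml.param import Param, Params


    class HasFanout(Params):
        """
        Mixin for param fanout (illustrative only).
        """

        # Declared once; the _copy_new_parent machinery added in
        # param/__init__.py rebinds a copy to each instance at construction.
        fanout = Param(Params._dummy(), "fanout", "hypothetical fan-out parameter")

        def __init__(self):
            super(HasFanout, self).__init__()
            self._setDefault(fanout=2)

        def setFanout(self, value):
            """
            Sets the value of :py:attr:`fanout`.
            """
            self._set(fanout=value)
            return self

        def getFanout(self):
            """
            Gets the value of fanout or its default value.
            """
            return self.getOrDefault(self.fanout)

Whether a setter goes through _set or writes to the param map directly is a detail of _shared_params_code_gen.py; the point is only that the Param object and its description are no longer duplicated between the class body and __init__.
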
http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/pipeline.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/pipeline.py b/python/pyspark/ml/pipeline.py
index 9f5f6ac..661074c 100644
--- a/python/pyspark/ml/pipeline.py
+++ b/python/pyspark/ml/pipeline.py
@@ -149,6 +149,8 @@ class Pipeline(Estimator):
     .. versionadded:: 1.3.0
     """
 
+    stages = Param(Params._dummy(), "stages", "pipeline stages")
+
     @keyword_only
     def __init__(self, stages=None):
         """
@@ -157,8 +159,6 @@ class Pipeline(Estimator):
         if stages is None:
             stages = []
         super(Pipeline, self).__init__()
-        #: Param for pipeline stages.
-        self.stages = Param(self, "stages", "pipeline stages")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 

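Pipeline is pure Python, so the effect of moving stages to a class-level Param can be checked without any JVM-backed stage (assuming only that a PySpark checkout is on the path and that the _copy_new_parent rebinding from param/__init__.py is in place):

    from pyspark.ml.pipeline import Pipeline

    p = Pipeline(stages=[])
    # The class-level placeholder should have been rebound to this instance.
    print(p.stages.parent == p.uid)   # expected: True
    print(p.getStages())              # expected: []
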
http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/recommendation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index b44c66f..08180a2 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -85,7 +85,6 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     rank = Param(Params._dummy(), "rank", "rank of the factorization")
     numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks")
     numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks")
@@ -108,16 +107,6 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
         """
         super(ALS, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.recommendation.ALS", self.uid)
-        self.rank = Param(self, "rank", "rank of the factorization")
-        self.numUserBlocks = Param(self, "numUserBlocks", "number of user blocks")
-        self.numItemBlocks = Param(self, "numItemBlocks", "number of item blocks")
-        self.implicitPrefs = Param(self, "implicitPrefs", "whether to use implicit preference")
-        self.alpha = Param(self, "alpha", "alpha for implicit preference")
-        self.userCol = Param(self, "userCol", "column name for user ids")
-        self.itemCol = Param(self, "itemCol", "column name for item ids")
-        self.ratingCol = Param(self, "ratingCol", "column name for ratings")
-        self.nonnegative = Param(self, "nonnegative",
-                                 "whether to use nonnegative constraint for least squares")
         self._setDefault(rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,
                          implicitPrefs=False, alpha=1.0, userCol="user", itemCol="item", seed=None,
                          ratingCol="rating", nonnegative=False, checkpointInterval=10)

http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 401bac0..74a2248 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -162,7 +162,6 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     DenseVector([0.0, 1.0])
     """
 
-    # a placeholder to make it appear in the generated doc
     isotonic = \
         Param(Params._dummy(), "isotonic",
               "whether the output sequence should be isotonic/increasing (true) or" +
@@ -181,14 +180,6 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
         super(IsotonicRegression, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.regression.IsotonicRegression", self.uid)
-        self.isotonic = \
-            Param(self, "isotonic",
-                  "whether the output sequence should be isotonic/increasing (true) or" +
-                  "antitonic/decreasing (false).")
-        self.featureIndex = \
-            Param(self, "featureIndex",
-                  "The index of the feature if featuresCol is a vector column, no effect " +
-                  "otherwise.")
         self._setDefault(isotonic=True, featureIndex=0)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
@@ -262,15 +253,11 @@ class TreeEnsembleParams(DecisionTreeParams):
     Mixin for Decision Tree-based ensemble algorithms parameters.
     """
 
-    # a placeholder to make it appear in the generated doc
     subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " +
                             "used for learning each decision tree, in range (0, 1].")
 
     def __init__(self):
         super(TreeEnsembleParams, self).__init__()
-        #: param for Fraction of the training data, in range (0, 1].
-        self.subsamplingRate = Param(self, "subsamplingRate", "Fraction of the training data " +
-                                     "used for learning each decision tree, in range (0, 1].")
 
     @since("1.4.0")
     def setSubsamplingRate(self, value):
@@ -294,7 +281,6 @@ class TreeRegressorParams(Params):
     """
 
     supportedImpurities = ["variance"]
-    # a placeholder to make it appear in the generated doc
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
                      "Supported options: " +
@@ -302,10 +288,6 @@ class TreeRegressorParams(Params):
 
     def __init__(self):
         super(TreeRegressorParams, self).__init__()
-        #: param for Criterion used for information gain calculation (case-insensitive).
-        self.impurity = Param(self, "impurity", "Criterion used for information " +
-                              "gain calculation (case-insensitive). Supported options: " +
-                              ", ".join(self.supportedImpurities))
 
     @since("1.4.0")
     def setImpurity(self, value):
@@ -329,7 +311,6 @@ class RandomForestParams(TreeEnsembleParams):
     """
 
     supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]
-    # a placeholder to make it appear in the generated doc
     numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).")
     featureSubsetStrategy = \
         Param(Params._dummy(), "featureSubsetStrategy",
@@ -338,13 +319,6 @@ class RandomForestParams(TreeEnsembleParams):
 
     def __init__(self):
         super(RandomForestParams, self).__init__()
-        #: param for Number of trees to train (>= 1).
-        self.numTrees = Param(self, "numTrees", "Number of trees to train (>= 1).")
-        #: param for The number of features to consider for splits at each tree node.
-        self.featureSubsetStrategy = \
-            Param(self, "featureSubsetStrategy",
-                  "The number of features to consider for splits at each tree node. Supported " +
-                  "options: " + ", ".join(self.supportedFeatureSubsetStrategies))
 
     @since("1.4.0")
     def setNumTrees(self, value):
@@ -609,7 +583,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
                      "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -627,10 +600,6 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
         """
         super(GBTRegressor, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid)
-        #: param for Loss function which GBT tries to minimize (case-insensitive).
-        self.lossType = Param(self, "lossType",
-                              "Loss function which GBT tries to minimize (case-insensitive). " +
-                              "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
                          checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1)
@@ -713,7 +682,6 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     .. versionadded:: 1.6.0
     """
 
-    # a placeholder to make it appear in the generated doc
     censorCol = Param(Params._dummy(), "censorCol",
                       "censor column name. The value of this column could be 0 or 1. " +
                       "If the value is 1, it means the event has occurred i.e. " +
@@ -739,20 +707,6 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
         super(AFTSurvivalRegression, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.regression.AFTSurvivalRegression", self.uid)
-        #: Param for censor column name
-        self.censorCol = Param(self,  "censorCol",
-                               "censor column name. The value of this column could be 0 or 1. " +
-                               "If the value is 1, it means the event has occurred i.e. " +
-                               "uncensored; otherwise censored.")
-        #: Param for quantile probabilities array
-        self.quantileProbabilities = \
-            Param(self, "quantileProbabilities",
-                  "quantile probabilities array. Values of the quantile probabilities array " +
-                  "should be in the range (0, 1) and the array should be non-empty.")
-        #: Param for quantiles column name
-        self.quantilesCol = Param(self, "quantilesCol",
-                                  "quantiles column name. This column will output quantiles of " +
-                                  "corresponding quantileProbabilities if it is set.")
         self._setDefault(censorCol="censor",
                          quantileProbabilities=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
         kwargs = self.__init__._input_kwargs

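Keeping the only copy of each description at class level also means the docs are reachable without instantiating anything, since plain attribute access needs no SparkContext. For example, against the numTrees declaration shown above (assuming a local PySpark checkout on the path):

    from pyspark.ml.regression import RandomForestRegressor

    # Single source of truth for name and description.
    print(RandomForestRegressor.numTrees.name)   # expected: numTrees
    print(RandomForestRegressor.numTrees.doc)    # expected: Number of trees to train (>= 1).
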
http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 9ea639d..c45a159 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -185,6 +185,18 @@ class OtherTestParams(HasMaxIter, HasInputCol, HasSeed):
 
 class ParamTests(PySparkTestCase):
 
+    def test_copy_new_parent(self):
+        testParams = TestParams()
+        # Copying an instantiated param should fail
+        with self.assertRaises(ValueError):
+            testParams.maxIter._copy_new_parent(testParams)
+        # Copying a dummy param should succeed
+        TestParams.maxIter._copy_new_parent(testParams)
+        maxIter = testParams.maxIter
+        self.assertEqual(maxIter.name, "maxIter")
+        self.assertEqual(maxIter.doc, "max number of iterations (>= 0).")
+        self.assertTrue(maxIter.parent == testParams.uid)
+
     def test_param(self):
         testParams = TestParams()
         maxIter = testParams.maxIter

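The new test above pins down the contract of _copy_new_parent: copying a class-level dummy param onto an instance succeeds and records that instance's uid as the parent, while copying a param that is already bound raises ValueError. The real implementation is in the param/__init__.py hunk earlier in this diff; purely to restate the contract in isolation, a self-contained toy stand-in (not the patch's code) could look like this:

    import copy

    _DUMMY_PARENT = "undefined"  # sentinel for an unbound, class-level param


    class ToyParam(object):
        """Illustrative stand-in for pyspark.ml.param.Param, not the patched class."""

        def __init__(self, parent_uid, name, doc):
            self.parent = parent_uid
            self.name = name
            self.doc = doc

        def _copy_new_parent(self, parent):
            # Only an unbound (dummy) param may be copied onto a real parent.
            if self.parent != _DUMMY_PARENT:
                raise ValueError("param is already bound to parent %r" % self.parent)
            param = copy.copy(self)
            param.parent = parent.uid
            return param


    class Owner(object):
        uid = "Owner_1234"

    maxIter = ToyParam(_DUMMY_PARENT, "maxIter", "max number of iterations (>= 0).")
    bound = maxIter._copy_new_parent(Owner())
    assert bound.parent == "Owner_1234"
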
http://git-wip-us.apache.org/repos/asf/spark/blob/eb917291/python/pyspark/ml/tuning.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 08f8db5..0cbe97f 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -115,18 +115,11 @@ class CrossValidator(Estimator, HasSeed):
     .. versionadded:: 1.4.0
     """
 
-    # a placeholder to make it appear in the generated doc
     estimator = Param(Params._dummy(), "estimator", "estimator to be cross-validated")
-
-    # a placeholder to make it appear in the generated doc
     estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps", "estimator param maps")
-
-    # a placeholder to make it appear in the generated doc
     evaluator = Param(
         Params._dummy(), "evaluator",
         "evaluator used to select hyper-parameters that maximize the cross-validated metric")
-
-    # a placeholder to make it appear in the generated doc
     numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation")
 
     @keyword_only
@@ -137,17 +130,6 @@ class CrossValidator(Estimator, HasSeed):
                  seed=None)
         """
         super(CrossValidator, self).__init__()
-        #: param for estimator to be cross-validated
-        self.estimator = Param(self, "estimator", "estimator to be cross-validated")
-        #: param for estimator param maps
-        self.estimatorParamMaps = Param(self, "estimatorParamMaps", "estimator param maps")
-        #: param for the evaluator used to select hyper-parameters that
-        #: maximize the cross-validated metric
-        self.evaluator = Param(
-            self, "evaluator",
-            "evaluator used to select hyper-parameters that maximize the cross-validated metric")
-        #: param for number of folds for cross validation
-        self.numFolds = Param(self, "numFolds", "number of folds for cross validation")
         self._setDefault(numFolds=3)
         kwargs = self.__init__._input_kwargs
         self._set(**kwargs)


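As with Pipeline, CrossValidator's constructor is pure Python, so a quick check that nothing user-visible changed (defaults still come from _setDefault, params still resolve per instance) needs no SparkContext. Again this assumes the _copy_new_parent rebinding from param/__init__.py:

    from pyspark.ml.tuning import CrossValidator

    cv = CrossValidator()
    print(cv.getNumFolds())                # expected: 3, from _setDefault(numFolds=3)
    print(cv.numFolds.parent == cv.uid)    # expected: True
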