Posted to commits@spark.apache.org by jk...@apache.org on 2016/03/23 19:20:49 UTC

spark git commit: [SPARK-13068][PYSPARK][ML] Type conversion for Pyspark params

Repository: spark
Updated Branches:
  refs/heads/master 48ee16d80 -> 30bdb5cbd


[SPARK-13068][PYSPARK][ML] Type conversion for Pyspark params

## What changes were proposed in this pull request?

This patch adds type conversion functionality for parameters in PySpark. A `typeConverter` field is added to the constructor of the `Param` class. This argument is a function which converts values passed to the param to the appropriate type, if possible. This is beneficial because params can now fail at set time when given inappropriate values, and even more so because coherent error messages are now provided in cases where Py4J cannot cast the Python type to the appropriate Java type.

This patch also adds a `TypeConverters` class with factory methods for common type conversions. Most of the changes involve adding these factory type converters to existing params. The previous solution to this issue, `expectedType`, is deprecated and can be removed in 2.1.0, as discussed on the JIRA ticket.
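
Below is a minimal sketch (not part of the patch itself) of how a param picks up one of these converters; it assumes a PySpark build that includes this change, and the `threshold` param is used purely as an illustrative example:

    from pyspark.ml.param import Param, Params, TypeConverters

    # Declare a param with a converter; values assigned to it are
    # normalized (here, to float) before being stored in the param map.
    threshold = Param(Params._dummy(), "threshold",
                      "Threshold in binary classification prediction, in range [0, 1].",
                      typeConverter=TypeConverters.toFloat)

    # The converter is an ordinary callable stored on the Param object.
    assert threshold.typeConverter(1) == 1.0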

## How was this patch tested?

Unit tests were added in python/pyspark/ml/tests.py to test parameter type conversion. These tests check that values that should be convertible are converted correctly, and that the appropriate errors are thrown when invalid values are provided.
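
As a rough illustration of the behavior these tests cover (a sketch against a PySpark build containing this patch, not the test code itself), the converters accept values that can be converted losslessly and raise `TypeError` otherwise:

    from pyspark.ml.param import TypeConverters

    assert TypeConverters.toFloat(1) == 1.0               # ints widen to float
    assert TypeConverters.toInt(5.0) == 5                 # integer-valued floats are accepted
    assert TypeConverters.toListFloat((1, 2, 3)) == [1.0, 2.0, 3.0]

    try:
        TypeConverters.toInt(5.5)                         # not integer-valued, so rejected
    except TypeError as e:
        print(e)                                          # "Could not convert 5.5 to int"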

Author: sethah <se...@gmail.com>

Closes #11663 from sethah/SPARK-13068-tc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30bdb5cb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30bdb5cb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30bdb5cb

Branch: refs/heads/master
Commit: 30bdb5cbd9aec191cf15cdc83c3fee375c04c2b2
Parents: 48ee16d
Author: sethah <se...@gmail.com>
Authored: Wed Mar 23 11:20:44 2016 -0700
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Wed Mar 23 11:20:44 2016 -0700

----------------------------------------------------------------------
 python/pyspark/ml/classification.py             |  20 +-
 python/pyspark/ml/clustering.py                 |  14 +-
 python/pyspark/ml/feature.py                    |  95 ++++++----
 python/pyspark/ml/param/__init__.py             | 181 +++++++++++++++++--
 .../pyspark/ml/param/_shared_params_code_gen.py |  91 ++++++----
 python/pyspark/ml/param/shared.py               |  58 +++---
 python/pyspark/ml/recommendation.py             |  25 ++-
 python/pyspark/ml/regression.py                 |  25 ++-
 python/pyspark/ml/tests.py                      |  83 +++++++--
 python/pyspark/ml/tuning.py                     |   5 +-
 10 files changed, 421 insertions(+), 176 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 8075108..fdeccf8 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -20,6 +20,7 @@ import warnings
 from pyspark import since
 from pyspark.ml.util import *
 from pyspark.ml.wrapper import JavaEstimator, JavaModel
+from pyspark.ml.param import TypeConverters
 from pyspark.ml.param.shared import *
 from pyspark.ml.regression import (
     RandomForestParams, TreeEnsembleParams, DecisionTreeModel, TreeEnsembleModels)
@@ -87,7 +88,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
 
     threshold = Param(Params._dummy(), "threshold",
                       "Threshold in binary classification prediction, in range [0, 1]." +
-                      " If threshold and thresholds are both set, they must match.")
+                      " If threshold and thresholds are both set, they must match.",
+                      typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -243,7 +245,7 @@ class TreeClassifierParams(object):
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation (case-insensitive). " +
                      "Supported options: " +
-                     ", ".join(supportedImpurities))
+                     ", ".join(supportedImpurities), typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(TreeClassifierParams, self).__init__()
@@ -534,7 +536,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
 
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
-                     "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
+                     "Supported options: " + ", ".join(GBTParams.supportedLossTypes),
+                     typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -652,9 +655,10 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
     """
 
     smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
-                      "default is 1.0")
+                      "default is 1.0", typeConverter=TypeConverters.toFloat)
     modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
-                      "(case-sensitive). Supported options: multinomial (default) and bernoulli.")
+                      "(case-sensitive). Supported options: multinomial (default) and bernoulli.",
+                      typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -782,11 +786,13 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
 
     layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
                    "E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
-                   "neurons and output layer of 10 neurons, default is [1, 1].")
+                   "neurons and output layer of 10 neurons, default is [1, 1].",
+                   typeConverter=TypeConverters.toListInt)
     blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking input data in " +
                       "matrices. Data is stacked within partitions. If block size is more than " +
                       "remaining data in a partition then it is adjusted to the size of this " +
-                      "data. Recommended size is between 10 and 1000, default is 128.")
+                      "data. Recommended size is between 10 and 1000, default is 128.",
+                      typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 2db5b82..e22d5c8 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -87,12 +87,14 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create")
+    k = Param(Params._dummy(), "k", "number of clusters to create",
+              typeConverter=TypeConverters.toInt)
     initMode = Param(Params._dummy(), "initMode",
                      "the initialization algorithm. This can be either \"random\" to " +
                      "choose random points as initial cluster centers, or \"k-means||\" " +
-                     "to use a parallel variant of k-means++")
-    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode")
+                     "to use a parallel variant of k-means++", TypeConverters.toString)
+    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode",
+                      typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -227,10 +229,12 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create")
+    k = Param(Params._dummy(), "k", "number of clusters to create",
+              typeConverter=TypeConverters.toInt)
     minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
                                     "the minimum number of points (if >= 1.0) " +
-                                    "or the minimum proportion")
+                                    "or the minimum proportion",
+                                    typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", maxIter=20,

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 16cb9d1..86b5328 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -83,7 +83,8 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Java
     """
 
     threshold = Param(Params._dummy(), "threshold",
-                      "threshold in binary classification prediction, in range [0, 1]")
+                      "threshold in binary classification prediction, in range [0, 1]",
+                      typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, threshold=0.0, inputCol=None, outputCol=None):
@@ -159,7 +160,8 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
               "range [x,y) except the last bucket, which also includes y. The splits " +
               "should be strictly increasing. Values at -inf, inf must be explicitly " +
               "provided to cover all Double values; otherwise, values outside the splits " +
-              "specified will be treated as errors.")
+              "specified will be treated as errors.",
+              typeConverter=TypeConverters.toListFloat)
 
     @keyword_only
     def __init__(self, splits=None, inputCol=None, outputCol=None):
@@ -243,15 +245,17 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable,
         " threshold are ignored. If this is an integer >= 1, then this specifies a count (of" +
         " times the term must appear in the document); if this is a double in [0,1), then this " +
         "specifies a fraction (out of the document's token count). Note that the parameter is " +
-        "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0")
+        "only used in transform of CountVectorizerModel and does not affect fitting. Default 1.0",
+        typeConverter=TypeConverters.toFloat)
     minDF = Param(
         Params._dummy(), "minDF", "Specifies the minimum number of" +
         " different documents a term must appear in to be included in the vocabulary." +
         " If this is an integer >= 1, this specifies the number of documents the term must" +
         " appear in; if this is a double in [0,1), then this specifies the fraction of documents." +
-        " Default 1.0")
+        " Default 1.0", typeConverter=TypeConverters.toFloat)
     vocabSize = Param(
-        Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.")
+        Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.",
+        typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None):
@@ -375,7 +379,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWrit
     """
 
     inverse = Param(Params._dummy(), "inverse", "Set transformer to perform inverse DCT, " +
-                    "default False.")
+                    "default False.", typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, inverse=False, inputCol=None, outputCol=None):
@@ -441,8 +445,8 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
     .. versionadded:: 1.5.0
     """
 
-    scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard product, " +
-                       "it must be MLlib Vector type.")
+    scalingVec = Param(Params._dummy(), "scalingVec", "Vector for hadamard product.",
+                       typeConverter=TypeConverters.toVector)
 
     @keyword_only
     def __init__(self, scalingVec=None, inputCol=None, outputCol=None):
@@ -564,7 +568,8 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
     """
 
     minDocFreq = Param(Params._dummy(), "minDocFreq",
-                       "minimum of documents in which a term should appear for filtering")
+                       "minimum of documents in which a term should appear for filtering",
+                       typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, minDocFreq=0, inputCol=None, outputCol=None):
@@ -746,8 +751,10 @@ class MinMaxScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Jav
     .. versionadded:: 1.6.0
     """
 
-    min = Param(Params._dummy(), "min", "Lower bound of the output feature range")
-    max = Param(Params._dummy(), "max", "Upper bound of the output feature range")
+    min = Param(Params._dummy(), "min", "Lower bound of the output feature range",
+                typeConverter=TypeConverters.toFloat)
+    max = Param(Params._dummy(), "max", "Upper bound of the output feature range",
+                typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
@@ -870,7 +877,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWr
     .. versionadded:: 1.5.0
     """
 
-    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
+    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, n=2, inputCol=None, outputCol=None):
@@ -936,7 +944,8 @@ class Normalizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
     .. versionadded:: 1.4.0
     """
 
-    p = Param(Params._dummy(), "p", "the p norm value.")
+    p = Param(Params._dummy(), "p", "the p norm value.",
+              typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, p=2.0, inputCol=None, outputCol=None):
@@ -1018,7 +1027,8 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
     .. versionadded:: 1.4.0
     """
 
-    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category")
+    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category",
+                     typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, dropLast=True, inputCol=None, outputCol=None):
@@ -1085,7 +1095,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLRead
     .. versionadded:: 1.4.0
     """
 
-    degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)")
+    degree = Param(Params._dummy(), "degree", "the polynomial degree to expand (>= 1)",
+                   typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, degree=2, inputCol=None, outputCol=None):
@@ -1163,7 +1174,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, HasSeed, Jav
     # a placeholder to make it appear in the generated doc
     numBuckets = Param(Params._dummy(), "numBuckets",
                        "Maximum number of buckets (quantiles, or " +
-                       "categories) into which data points are grouped. Must be >= 2. Default 2.")
+                       "categories) into which data points are grouped. Must be >= 2. Default 2.",
+                       typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
@@ -1255,11 +1267,13 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
     .. versionadded:: 1.4.0
     """
 
-    minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
+    minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)",
+                           typeConverter=TypeConverters.toInt)
     gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing",
+                    TypeConverters.toString)
     toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
-                        "lowercase before tokenizing")
+                        "lowercase before tokenizing", TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None,
@@ -1370,7 +1384,7 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable):
     .. versionadded:: 1.6.0
     """
 
-    statement = Param(Params._dummy(), "statement", "SQL statement")
+    statement = Param(Params._dummy(), "statement", "SQL statement", TypeConverters.toString)
 
     @keyword_only
     def __init__(self, statement=None):
@@ -1444,8 +1458,9 @@ class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, J
     .. versionadded:: 1.4.0
     """
 
-    withMean = Param(Params._dummy(), "withMean", "Center data with mean")
-    withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation")
+    withMean = Param(Params._dummy(), "withMean", "Center data with mean", TypeConverters.toBoolean)
+    withStd = Param(Params._dummy(), "withStd", "Scale to unit standard deviation",
+                    TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, withMean=False, withStd=True, inputCol=None, outputCol=None):
@@ -1628,7 +1643,8 @@ class IndexToString(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
 
     labels = Param(Params._dummy(), "labels",
                    "Optional array of labels specifying index-string mapping." +
-                   " If not provided or if empty, then metadata from inputCol is used instead.")
+                   " If not provided or if empty, then metadata from inputCol is used instead.",
+                   typeConverter=TypeConverters.toListString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, labels=None):
@@ -1689,9 +1705,10 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl
     .. versionadded:: 1.6.0
     """
 
-    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out")
+    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out",
+                      typeConverter=TypeConverters.toListString)
     caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " +
-                          "comparison over the stop words")
+                          "comparison over the stop words", TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, stopWords=None,
@@ -1930,7 +1947,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
     maxCategories = Param(Params._dummy(), "maxCategories",
                           "Threshold for the number of values a categorical feature can take " +
                           "(>= 2). If a feature is found to have > maxCategories values, then " +
-                          "it is declared continuous.")
+                          "it is declared continuous.", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
@@ -2035,11 +2052,12 @@ class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, J
     """
 
     indices = Param(Params._dummy(), "indices", "An array of indices to select features from " +
-                    "a vector column. There can be no overlap with names.")
+                    "a vector column. There can be no overlap with names.",
+                    typeConverter=TypeConverters.toListInt)
     names = Param(Params._dummy(), "names", "An array of feature names to select features from " +
                   "a vector column. These names must be specified by ML " +
                   "org.apache.spark.ml.attribute.Attribute. There can be no overlap with " +
-                  "indices.")
+                  "indices.", typeConverter=TypeConverters.toListString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, indices=None, names=None):
@@ -2147,12 +2165,14 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has
     """
 
     vectorSize = Param(Params._dummy(), "vectorSize",
-                       "the dimension of codes after transforming from words")
+                       "the dimension of codes after transforming from words",
+                       typeConverter=TypeConverters.toInt)
     numPartitions = Param(Params._dummy(), "numPartitions",
-                          "number of partitions for sentences of words")
+                          "number of partitions for sentences of words",
+                          typeConverter=TypeConverters.toInt)
     minCount = Param(Params._dummy(), "minCount",
                      "the minimum number of times a token must appear to be included in the " +
-                     "word2vec model's vocabulary")
+                     "word2vec model's vocabulary", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1,
@@ -2293,7 +2313,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "the number of principal components")
+    k = Param(Params._dummy(), "k", "the number of principal components",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, k=None, inputCol=None, outputCol=None):
@@ -2425,7 +2446,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, HasLabelCol, JavaMLReadable, JavaM
     .. versionadded:: 1.5.0
     """
 
-    formula = Param(Params._dummy(), "formula", "R model formula")
+    formula = Param(Params._dummy(), "formula", "R model formula", TypeConverters.toString)
 
     @keyword_only
     def __init__(self, formula=None, featuresCol="features", labelCol="label"):
@@ -2511,12 +2532,11 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
     .. versionadded:: 2.0.0
     """
 
-    # a placeholder to make it appear in the generated doc
     numTopFeatures = \
         Param(Params._dummy(), "numTopFeatures",
               "Number of features that selector will select, ordered by statistics value " +
               "descending. If the number of features is < numTopFeatures, then this will select " +
-              "all features.")
+              "all features.", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label"):
@@ -2525,11 +2545,6 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja
         """
         super(ChiSqSelector, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
-        self.numTopFeatures = \
-            Param(self, "numTopFeatures",
-                  "Number of features that selector will select, ordered by statistics value " +
-                  "descending. If the number of features is < numTopFeatures, then this will " +
-                  "select all features.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index c0f0a71..a126529 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -14,31 +14,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import array
+import sys
+if sys.version > '3':
+    basestring = str
+    xrange = range
+    unicode = str
 
 from abc import ABCMeta
 import copy
+import numpy as np
+import warnings
 
 from pyspark import since
 from pyspark.ml.util import Identifiable
+from pyspark.mllib.linalg import DenseVector, Vector
 
 
-__all__ = ['Param', 'Params']
+__all__ = ['Param', 'Params', 'TypeConverters']
 
 
 class Param(object):
     """
     A param with self-contained documentation.
 
+    Note: `expectedType` is deprecated and will be removed in 2.1. Use typeConverter instead,
+          as a keyword argument.
+
     .. versionadded:: 1.3.0
     """
 
-    def __init__(self, parent, name, doc, expectedType=None):
+    def __init__(self, parent, name, doc, expectedType=None, typeConverter=None):
         if not isinstance(parent, Identifiable):
             raise TypeError("Parent must be an Identifiable but got type %s." % type(parent))
         self.parent = parent.uid
         self.name = str(name)
         self.doc = str(doc)
         self.expectedType = expectedType
+        if expectedType is not None:
+            warnings.warn("expectedType is deprecated and will be removed in 2.1. " +
+                          "Use typeConverter instead, as a keyword argument.")
+        self.typeConverter = TypeConverters.identity if typeConverter is None else typeConverter
 
     def _copy_new_parent(self, parent):
         """Copy the current param to a new parent, must be a dummy param."""
@@ -65,6 +81,146 @@ class Param(object):
             return False
 
 
+class TypeConverters(object):
+    """
+    .. note:: DeveloperApi
+
+    Factory methods for common type conversion functions for `Param.typeConverter`.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @staticmethod
+    def _is_numeric(value):
+        vtype = type(value)
+        return vtype in [int, float, np.float64, np.int64] or vtype.__name__ == 'long'
+
+    @staticmethod
+    def _is_integer(value):
+        return TypeConverters._is_numeric(value) and float(value).is_integer()
+
+    @staticmethod
+    def _can_convert_to_list(value):
+        vtype = type(value)
+        return vtype in [list, np.ndarray, tuple, xrange, array.array] or isinstance(value, Vector)
+
+    @staticmethod
+    def _can_convert_to_string(value):
+        vtype = type(value)
+        return isinstance(value, basestring) or vtype in [np.unicode_, np.string_, np.str_]
+
+    @staticmethod
+    def identity(value):
+        """
+        Dummy converter that just returns value.
+        """
+        return value
+
+    @staticmethod
+    def toList(value):
+        """
+        Convert a value to a list, if possible.
+        """
+        if type(value) == list:
+            return value
+        elif type(value) in [np.ndarray, tuple, xrange, array.array]:
+            return list(value)
+        elif isinstance(value, Vector):
+            return list(value.toArray())
+        else:
+            raise TypeError("Could not convert %s to list" % value)
+
+    @staticmethod
+    def toListFloat(value):
+        """
+        Convert a value to list of floats, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_numeric(v), value)):
+                return [float(v) for v in value]
+        raise TypeError("Could not convert %s to list of floats" % value)
+
+    @staticmethod
+    def toListInt(value):
+        """
+        Convert a value to list of ints, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_integer(v), value)):
+                return [int(v) for v in value]
+        raise TypeError("Could not convert %s to list of ints" % value)
+
+    @staticmethod
+    def toListString(value):
+        """
+        Convert a value to list of strings, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._can_convert_to_string(v), value)):
+                return [TypeConverters.toString(v) for v in value]
+        raise TypeError("Could not convert %s to list of strings" % value)
+
+    @staticmethod
+    def toVector(value):
+        """
+        Convert a value to a MLlib Vector, if possible.
+        """
+        if isinstance(value, Vector):
+            return value
+        elif TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_numeric(v), value)):
+                return DenseVector(value)
+        raise TypeError("Could not convert %s to vector" % value)
+
+    @staticmethod
+    def toFloat(value):
+        """
+        Convert a value to a float, if possible.
+        """
+        if TypeConverters._is_numeric(value):
+            return float(value)
+        else:
+            raise TypeError("Could not convert %s to float" % value)
+
+    @staticmethod
+    def toInt(value):
+        """
+        Convert a value to an int, if possible.
+        """
+        if TypeConverters._is_integer(value):
+            return int(value)
+        else:
+            raise TypeError("Could not convert %s to int" % value)
+
+    @staticmethod
+    def toString(value):
+        """
+        Convert a value to a string, if possible.
+        """
+        if isinstance(value, basestring):
+            return value
+        elif type(value) in [np.string_, np.str_]:
+            return str(value)
+        elif type(value) == np.unicode_:
+            return unicode(value)
+        else:
+            raise TypeError("Could not convert %s to string type" % type(value))
+
+    @staticmethod
+    def toBoolean(value):
+        """
+        Convert a value to a boolean, if possible.
+        """
+        if type(value) == bool:
+            return value
+        else:
+            raise TypeError("Boolean Param requires value of type bool. Found %s." % type(value))
+
+
 class Params(Identifiable):
     """
     Components that take parameters. This also provides an internal
@@ -275,23 +431,12 @@ class Params(Identifiable):
         """
         for param, value in kwargs.items():
             p = getattr(self, param)
-            if p.expectedType is None or type(value) == p.expectedType or value is None:
-                self._paramMap[getattr(self, param)] = value
-            else:
+            if value is not None:
                 try:
-                    # Try and do "safe" conversions that don't lose information
-                    if p.expectedType == float:
-                        self._paramMap[getattr(self, param)] = float(value)
-                    # Python 3 unified long & int
-                    elif p.expectedType == int and type(value).__name__ == 'long':
-                        self._paramMap[getattr(self, param)] = value
-                    else:
-                        raise Exception(
-                            "Provided type {0} incompatible with type {1} for param {2}"
-                            .format(type(value), p.expectedType, p))
-                except ValueError:
-                    raise Exception(("Failed to convert {0} to type {1} for param {2}"
-                                     .format(type(value), p.expectedType, p)))
+                    value = p.typeConverter(value)
+                except TypeError as e:
+                    raise TypeError('Invalid param value given for param "%s". %s' % (p.name, e))
+            self._paramMap[p] = value
         return self
 
     def _setDefault(self, **kwargs):

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 5e297b8..7dd2937 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -38,7 +38,7 @@ header = """#
 # python _shared_params_code_gen.py > shared.py
 
 
-def _gen_param_header(name, doc, defaultValueStr, expectedType):
+def _gen_param_header(name, doc, defaultValueStr, typeConverter):
     """
     Generates the header part for shared variables
 
@@ -50,7 +50,7 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType):
     Mixin for param $name: $doc
     """
 
-    $name = Param(Params._dummy(), "$name", "$doc", $expectedType)
+    $name = Param(Params._dummy(), "$name", "$doc", typeConverter=$typeConverter)
 
     def __init__(self):
         super(Has$Name, self).__init__()'''
@@ -60,15 +60,14 @@ def _gen_param_header(name, doc, defaultValueStr, expectedType):
         self._setDefault($name=$defaultValueStr)'''
 
     Name = name[0].upper() + name[1:]
-    expectedTypeName = str(expectedType)
-    if expectedType is not None:
-        expectedTypeName = expectedType.__name__
+    if typeConverter is None:
+        typeConverter = str(None)
     return template \
         .replace("$name", name) \
         .replace("$Name", Name) \
         .replace("$doc", doc) \
         .replace("$defaultValueStr", str(defaultValueStr)) \
-        .replace("$expectedType", expectedTypeName)
+        .replace("$typeConverter", typeConverter)
 
 
 def _gen_param_code(name, doc, defaultValueStr):
@@ -105,64 +104,73 @@ def _gen_param_code(name, doc, defaultValueStr):
 if __name__ == "__main__":
     print(header)
     print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n")
-    print("from pyspark.ml.param import Param, Params\n\n")
+    print("from pyspark.ml.param import *\n\n")
     shared = [
-        ("maxIter", "max number of iterations (>= 0).", None, int),
-        ("regParam", "regularization parameter (>= 0).", None, float),
-        ("featuresCol", "features column name.", "'features'", str),
-        ("labelCol", "label column name.", "'label'", str),
-        ("predictionCol", "prediction column name.", "'prediction'", str),
+        ("maxIter", "max number of iterations (>= 0).", None, "TypeConverters.toInt"),
+        ("regParam", "regularization parameter (>= 0).", None, "TypeConverters.toFloat"),
+        ("featuresCol", "features column name.", "'features'", "TypeConverters.toString"),
+        ("labelCol", "label column name.", "'label'", "TypeConverters.toString"),
+        ("predictionCol", "prediction column name.", "'prediction'", "TypeConverters.toString"),
         ("probabilityCol", "Column name for predicted class conditional probabilities. " +
          "Note: Not all models output well-calibrated probability estimates! These probabilities " +
-         "should be treated as confidences, not precise probabilities.", "'probability'", str),
+         "should be treated as confidences, not precise probabilities.", "'probability'",
+         "TypeConverters.toString"),
         ("rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", "'rawPrediction'",
-         str),
-        ("inputCol", "input column name.", None, str),
-        ("inputCols", "input column names.", None, None),
-        ("outputCol", "output column name.", "self.uid + '__output'", str),
-        ("numFeatures", "number of features.", None, int),
+         "TypeConverters.toString"),
+        ("inputCol", "input column name.", None, "TypeConverters.toString"),
+        ("inputCols", "input column names.", None, "TypeConverters.toListString"),
+        ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"),
+        ("numFeatures", "number of features.", None, "TypeConverters.toInt"),
         ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " +
-         "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, int),
-        ("seed", "random seed.", "hash(type(self).__name__)", int),
-        ("tol", "the convergence tolerance for iterative algorithms.", None, float),
-        ("stepSize", "Step size to be used for each iteration of optimization.", None, float),
+         "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None,
+         "TypeConverters.toInt"),
+        ("seed", "random seed.", "hash(type(self).__name__)", "TypeConverters.toInt"),
+        ("tol", "the convergence tolerance for iterative algorithms.", None,
+         "TypeConverters.toFloat"),
+        ("stepSize", "Step size to be used for each iteration of optimization.", None,
+         "TypeConverters.toFloat"),
         ("handleInvalid", "how to handle invalid entries. Options are skip (which will filter " +
          "out rows with bad values), or error (which will throw an errror). More options may be " +
-         "added later.", None, str),
+         "added later.", None, "TypeConverters.toBoolean"),
         ("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " +
-         "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0", float),
-        ("fitIntercept", "whether to fit an intercept term.", "True", bool),
+         "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", "0.0",
+         "TypeConverters.toFloat"),
+        ("fitIntercept", "whether to fit an intercept term.", "True", "TypeConverters.toBoolean"),
         ("standardization", "whether to standardize the training features before fitting the " +
-         "model.", "True", bool),
+         "model.", "True", "TypeConverters.toBoolean"),
         ("thresholds", "Thresholds in multi-class classification to adjust the probability of " +
          "predicting each class. Array must have length equal to the number of classes, with " +
          "values >= 0. The class with largest value p/t is predicted, where p is the original " +
-         "probability of that class and t is the class' threshold.", None, None),
+         "probability of that class and t is the class' threshold.", None,
+         "TypeConverters.toListFloat"),
         ("weightCol", "weight column name. If this is not set or empty, we treat " +
-         "all instance weights as 1.0.", None, str),
+         "all instance weights as 1.0.", None, "TypeConverters.toString"),
         ("solver", "the solver algorithm for optimization. If this is not set or empty, " +
-         "default value is 'auto'.", "'auto'", str)]
+         "default value is 'auto'.", "'auto'", "TypeConverters.toString")]
 
     code = []
-    for name, doc, defaultValueStr, expectedType in shared:
-        param_code = _gen_param_header(name, doc, defaultValueStr, expectedType)
+    for name, doc, defaultValueStr, typeConverter in shared:
+        param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter)
         code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr))
 
     decisionTreeParams = [
         ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " +
-         "depth 1 means 1 internal node + 2 leaf nodes."),
+         "depth 1 means 1 internal node + 2 leaf nodes.", "TypeConverters.toInt"),
         ("maxBins", "Max number of bins for" +
          " discretizing continuous features.  Must be >=2 and >= number of categories for any" +
-         " categorical feature."),
+         " categorical feature.", "TypeConverters.toInt"),
         ("minInstancesPerNode", "Minimum number of instances each child must have after split. " +
          "If a split causes the left or right child to have fewer than minInstancesPerNode, the " +
-         "split will be discarded as invalid. Should be >= 1."),
-        ("minInfoGain", "Minimum information gain for a split to be considered at a tree node."),
-        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation."),
+         "split will be discarded as invalid. Should be >= 1.", "TypeConverters.toInt"),
+        ("minInfoGain", "Minimum information gain for a split to be considered at a tree node.",
+         "TypeConverters.toFloat"),
+        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.",
+         "TypeConverters.toInt"),
         ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " +
          "instances with nodes. If true, the algorithm will cache node IDs for each instance. " +
          "Caching can speed up training of deeper trees. Users can set how often should the " +
-         "cache be checkpointed or disable it by setting checkpointInterval.")]
+         "cache be checkpointed or disable it by setting checkpointInterval.",
+         "TypeConverters.toBoolean")]
 
     decisionTreeCode = '''class DecisionTreeParams(Params):
     """
@@ -175,9 +183,12 @@ if __name__ == "__main__":
         super(DecisionTreeParams, self).__init__()'''
     dtParamMethods = ""
     dummyPlaceholders = ""
-    paramTemplate = """$name = Param($owner, "$name", "$doc")"""
-    for name, doc in decisionTreeParams:
-        variable = paramTemplate.replace("$name", name).replace("$doc", doc)
+    paramTemplate = """$name = Param($owner, "$name", "$doc", typeConverter=$typeConverterStr)"""
+    for name, doc, typeConverterStr in decisionTreeParams:
+        if typeConverterStr is None:
+            typeConverterStr = str(None)
+        variable = paramTemplate.replace("$name", name).replace("$doc", doc) \
+            .replace("$typeConverterStr", typeConverterStr)
         dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n    "
         dtParamMethods += _gen_param_code(name, doc, None) + "\n"
     code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" +

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index db4a8a5..83fbd59 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -17,7 +17,7 @@
 
 # DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.
 
-from pyspark.ml.param import Param, Params
+from pyspark.ml.param import *
 
 
 class HasMaxIter(Params):
@@ -25,7 +25,7 @@ class HasMaxIter(Params):
     Mixin for param maxIter: max number of iterations (>= 0).
     """
 
-    maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", int)
+    maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 0).", typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasMaxIter, self).__init__()
@@ -49,7 +49,7 @@ class HasRegParam(Params):
     Mixin for param regParam: regularization parameter (>= 0).
     """
 
-    regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", float)
+    regParam = Param(Params._dummy(), "regParam", "regularization parameter (>= 0).", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasRegParam, self).__init__()
@@ -73,7 +73,7 @@ class HasFeaturesCol(Params):
     Mixin for param featuresCol: features column name.
     """
 
-    featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", str)
+    featuresCol = Param(Params._dummy(), "featuresCol", "features column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasFeaturesCol, self).__init__()
@@ -98,7 +98,7 @@ class HasLabelCol(Params):
     Mixin for param labelCol: label column name.
     """
 
-    labelCol = Param(Params._dummy(), "labelCol", "label column name.", str)
+    labelCol = Param(Params._dummy(), "labelCol", "label column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasLabelCol, self).__init__()
@@ -123,7 +123,7 @@ class HasPredictionCol(Params):
     Mixin for param predictionCol: prediction column name.
     """
 
-    predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", str)
+    predictionCol = Param(Params._dummy(), "predictionCol", "prediction column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasPredictionCol, self).__init__()
@@ -148,7 +148,7 @@ class HasProbabilityCol(Params):
     Mixin for param probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.
     """
 
-    probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", str)
+    probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasProbabilityCol, self).__init__()
@@ -173,7 +173,7 @@ class HasRawPredictionCol(Params):
     Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) column name.
     """
 
-    rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", str)
+    rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw prediction (a.k.a. confidence) column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasRawPredictionCol, self).__init__()
@@ -198,7 +198,7 @@ class HasInputCol(Params):
     Mixin for param inputCol: input column name.
     """
 
-    inputCol = Param(Params._dummy(), "inputCol", "input column name.", str)
+    inputCol = Param(Params._dummy(), "inputCol", "input column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasInputCol, self).__init__()
@@ -222,7 +222,7 @@ class HasInputCols(Params):
     Mixin for param inputCols: input column names.
     """
 
-    inputCols = Param(Params._dummy(), "inputCols", "input column names.", None)
+    inputCols = Param(Params._dummy(), "inputCols", "input column names.", typeConverter=TypeConverters.toListString)
 
     def __init__(self):
         super(HasInputCols, self).__init__()
@@ -246,7 +246,7 @@ class HasOutputCol(Params):
     Mixin for param outputCol: output column name.
     """
 
-    outputCol = Param(Params._dummy(), "outputCol", "output column name.", str)
+    outputCol = Param(Params._dummy(), "outputCol", "output column name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasOutputCol, self).__init__()
@@ -271,7 +271,7 @@ class HasNumFeatures(Params):
     Mixin for param numFeatures: number of features.
     """
 
-    numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", int)
+    numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasNumFeatures, self).__init__()
@@ -295,7 +295,7 @@ class HasCheckpointInterval(Params):
     Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
     """
 
-    checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", int)
+    checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasCheckpointInterval, self).__init__()
@@ -319,7 +319,7 @@ class HasSeed(Params):
     Mixin for param seed: random seed.
     """
 
-    seed = Param(Params._dummy(), "seed", "random seed.", int)
+    seed = Param(Params._dummy(), "seed", "random seed.", typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasSeed, self).__init__()
@@ -344,7 +344,7 @@ class HasTol(Params):
     Mixin for param tol: the convergence tolerance for iterative algorithms.
     """
 
-    tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", float)
+    tol = Param(Params._dummy(), "tol", "the convergence tolerance for iterative algorithms.", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasTol, self).__init__()
@@ -368,7 +368,7 @@ class HasStepSize(Params):
     Mixin for param stepSize: Step size to be used for each iteration of optimization.
     """
 
-    stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", float)
+    stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for each iteration of optimization.", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasStepSize, self).__init__()
@@ -392,7 +392,7 @@ class HasHandleInvalid(Params):
     Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.
     """
 
-    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", str)
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an errror). More options may be added later.", typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasHandleInvalid, self).__init__()
@@ -416,7 +416,7 @@ class HasElasticNetParam(Params):
     Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
     """
 
-    elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", float)
+    elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasElasticNetParam, self).__init__()
@@ -441,7 +441,7 @@ class HasFitIntercept(Params):
     Mixin for param fitIntercept: whether to fit an intercept term.
     """
 
-    fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", bool)
+    fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an intercept term.", typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasFitIntercept, self).__init__()
@@ -466,7 +466,7 @@ class HasStandardization(Params):
     Mixin for param standardization: whether to standardize the training features before fitting the model.
     """
 
-    standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", bool)
+    standardization = Param(Params._dummy(), "standardization", "whether to standardize the training features before fitting the model.", typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasStandardization, self).__init__()
@@ -491,7 +491,7 @@ class HasThresholds(Params):
     Mixin for param thresholds: Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.
     """
 
-    thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", None)
+    thresholds = Param(Params._dummy(), "thresholds", "Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values >= 0. The class with largest value p/t is predicted, where p is the original probability of that class and t is the class' threshold.", typeConverter=TypeConverters.toListFloat)
 
     def __init__(self):
         super(HasThresholds, self).__init__()
@@ -515,7 +515,7 @@ class HasWeightCol(Params):
     Mixin for param weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0.
     """
 
-    weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", str)
+    weightCol = Param(Params._dummy(), "weightCol", "weight column name. If this is not set or empty, we treat all instance weights as 1.0.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasWeightCol, self).__init__()
@@ -539,7 +539,7 @@ class HasSolver(Params):
     Mixin for param solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.
     """
 
-    solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", str)
+    solver = Param(Params._dummy(), "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasSolver, self).__init__()
@@ -564,12 +564,12 @@ class DecisionTreeParams(Params):
     Mixin for Decision Tree parameters.
     """
 
-    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.")
-    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.")
-    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.")
-    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
-    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
-    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.")
+    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", typeConverter=TypeConverters.toInt)
+    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.", typeConverter=TypeConverters.toInt)
+    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.", typeConverter=TypeConverters.toInt)
+    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.", typeConverter=TypeConverters.toFloat)
+    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.", typeConverter=TypeConverters.toInt)
+    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.", typeConverter=TypeConverters.toBoolean)
     
 
     def __init__(self):

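As a minimal sketch of what the converters on these shared params buy us (assuming an active SparkContext, since constructing a JavaEstimator touches the JVM gateway, and relying only on behaviour described in the PR text and exercised in tests.py below): integral floats are coerced by TypeConverters.toInt, column names pass through toString, and values that cannot be converted raise a TypeError at set time instead of an opaque Py4J cast error at fit time.

    from pyspark.ml.classification import LogisticRegression

    lr = LogisticRegression(maxIter=5.0)      # TypeConverters.toInt coerces an integral float
    assert lr.getMaxIter() == 5 and type(lr.getMaxIter()) == int
    lr.setWeightCol(u'weight')                # toString accepts str and unicode alike

    try:
        LogisticRegression(standardization="true")   # toBoolean does not parse strings
    except TypeError as e:
        print(e)                              # coherent error message, raised before fit()
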
http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/recommendation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py
index de4c267..7c7a1b6 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -100,16 +100,23 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha
     .. versionadded:: 1.4.0
     """
 
-    rank = Param(Params._dummy(), "rank", "rank of the factorization")
-    numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks")
-    numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks")
-    implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use implicit preference")
-    alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference")
-    userCol = Param(Params._dummy(), "userCol", "column name for user ids")
-    itemCol = Param(Params._dummy(), "itemCol", "column name for item ids")
-    ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings")
+    rank = Param(Params._dummy(), "rank", "rank of the factorization",
+                 typeConverter=TypeConverters.toInt)
+    numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user blocks",
+                          typeConverter=TypeConverters.toInt)
+    numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item blocks",
+                          typeConverter=TypeConverters.toInt)
+    implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use implicit preference",
+                          TypeConverters.toBoolean)
+    alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference",
+                  typeConverter=TypeConverters.toFloat)
+    userCol = Param(Params._dummy(), "userCol", "column name for user ids", TypeConverters.toString)
+    itemCol = Param(Params._dummy(), "itemCol", "column name for item ids", TypeConverters.toString)
+    ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings",
+                      TypeConverters.toString)
     nonnegative = Param(Params._dummy(), "nonnegative",
-                        "whether to use nonnegative constraint for least squares")
+                        "whether to use nonnegative constraint for least squares",
+                        TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10,

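The ALS params follow the same pattern; note that most use the typeConverter keyword while implicitPrefs, userCol, itemCol, ratingCol and nonnegative pass the converter positionally. A hedged sketch of the intended effect (assuming an active SparkContext, analogous to the maxIter conversion tested in tests.py below):

    from pyspark.ml.recommendation import ALS

    als = ALS(rank=10.0, userCol="userId", itemCol="movieId", ratingCol="rating")
    assert als.getRank() == 10                # coerced to a Python int

    try:
        ALS(rank="ten")                       # not convertible to an int
    except TypeError as e:
        print(e)
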
http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 664a44b..8982608 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -189,10 +189,11 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     isotonic = \
         Param(Params._dummy(), "isotonic",
               "whether the output sequence should be isotonic/increasing (true) or" +
-              "antitonic/decreasing (false).")
+              "antitonic/decreasing (false).", typeConverter=TypeConverters.toBoolean)
     featureIndex = \
         Param(Params._dummy(), "featureIndex",
-              "The index of the feature if featuresCol is a vector column, no effect otherwise.")
+              "The index of the feature if featuresCol is a vector column, no effect otherwise.",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -278,7 +279,8 @@ class TreeEnsembleParams(DecisionTreeParams):
     """
 
     subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of the training data " +
-                            "used for learning each decision tree, in range (0, 1].")
+                            "used for learning each decision tree, in range (0, 1].",
+                            typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(TreeEnsembleParams, self).__init__()
@@ -335,11 +337,13 @@ class RandomForestParams(TreeEnsembleParams):
     """
 
     supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", "log2"]
-    numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).")
+    numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train (>= 1).",
+                     typeConverter=TypeConverters.toInt)
     featureSubsetStrategy = \
         Param(Params._dummy(), "featureSubsetStrategy",
               "The number of features to consider for splits at each tree node. Supported " +
-              "options: " + ", ".join(supportedFeatureSubsetStrategies))
+              "options: " + ", ".join(supportedFeatureSubsetStrategies),
+              typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(RandomForestParams, self).__init__()
@@ -653,7 +657,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
 
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize (case-insensitive). " +
-                     "Supported options: " + ", ".join(GBTParams.supportedLossTypes))
+                     "Supported options: " + ", ".join(GBTParams.supportedLossTypes),
+                     typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
@@ -767,14 +772,16 @@ class AFTSurvivalRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi
     censorCol = Param(Params._dummy(), "censorCol",
                       "censor column name. The value of this column could be 0 or 1. " +
                       "If the value is 1, it means the event has occurred i.e. " +
-                      "uncensored; otherwise censored.")
+                      "uncensored; otherwise censored.", typeConverter=TypeConverters.toString)
     quantileProbabilities = \
         Param(Params._dummy(), "quantileProbabilities",
               "quantile probabilities array. Values of the quantile probabilities array " +
-              "should be in the range (0, 1) and the array should be non-empty.")
+              "should be in the range (0, 1) and the array should be non-empty.",
+              typeConverter=TypeConverters.toListFloat)
     quantilesCol = Param(Params._dummy(), "quantilesCol",
                          "quantiles column name. This column will output quantiles of " +
-                         "corresponding quantileProbabilities if it is set.")
+                         "corresponding quantileProbabilities if it is set.",
+                         typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",

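quantileProbabilities is the one list-valued param in this file, wired to TypeConverters.toListFloat. A hedged sketch (assuming an active SparkContext, and assuming toListFloat accepts numpy arrays the same way toList does in the tests below) of how list-like inputs should come back as a plain list of floats:

    import numpy as np
    from pyspark.ml.regression import AFTSurvivalRegression

    aft = AFTSurvivalRegression(quantileProbabilities=np.array([0.25, 0.5, 0.75]))
    probs = aft.getQuantileProbabilities()
    assert probs == [0.25, 0.5, 0.75]
    assert all(type(p) == float for p in probs)
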
http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 211248e..2fa5da7 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -18,8 +18,11 @@
 """
 Unit tests for Spark ML Python APIs.
 """
-
+import array
 import sys
+if sys.version > '3':
+    xrange = range
+
 try:
     import xmlrunner
 except ImportError:
@@ -36,19 +39,20 @@ else:
 
 from shutil import rmtree
 import tempfile
+import numpy as np
 
 from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.evaluation import RegressionEvaluator
 from pyspark.ml.feature import *
-from pyspark.ml.param import Param, Params
+from pyspark.ml.param import Param, Params, TypeConverters
 from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed
 from pyspark.ml.regression import LinearRegression
 from pyspark.ml.tuning import *
 from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaWrapper
-from pyspark.mllib.linalg import DenseVector
+from pyspark.mllib.linalg import DenseVector, SparseVector
 from pyspark.sql import DataFrame, SQLContext, Row
 from pyspark.sql.functions import rand
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
@@ -104,20 +108,65 @@ class ParamTypeConversionTests(PySparkTestCase):
     Test that param type conversion happens.
     """
 
-    def test_int_to_float(self):
-        from pyspark.mllib.linalg import Vectors
-        df = self.sc.parallelize([
-            Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
-        lr = LogisticRegression(elasticNetParam=0)
-        lr.fit(df)
-        lr.setElasticNetParam(0)
-        lr.fit(df)
-
-    def test_invalid_to_float(self):
-        from pyspark.mllib.linalg import Vectors
-        self.assertRaises(Exception, lambda: LogisticRegression(elasticNetParam="happy"))
-        lr = LogisticRegression(elasticNetParam=0)
-        self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda"))
+    def test_int(self):
+        lr = LogisticRegression(maxIter=5.0)
+        self.assertEqual(lr.getMaxIter(), 5)
+        self.assertTrue(type(lr.getMaxIter()) == int)
+        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter="notAnInt"))
+        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
+
+    def test_float(self):
+        lr = LogisticRegression(tol=1)
+        self.assertEqual(lr.getTol(), 1.0)
+        self.assertTrue(type(lr.getTol()) == float)
+        self.assertRaises(TypeError, lambda: LogisticRegression(tol="notAFloat"))
+
+    def test_vector(self):
+        ewp = ElementwiseProduct(scalingVec=[1, 3])
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
+        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
+        self.assertRaises(TypeError, lambda: ElementwiseProduct(scalingVec=["a", "b"]))
+
+    def test_list(self):
+        l = [0, 1]
+        for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), range(len(l)), l),
+                         array.array('l', l), xrange(2), tuple(l)]:
+            converted = TypeConverters.toList(lst_like)
+            self.assertEqual(type(converted), list)
+            self.assertListEqual(converted, l)
+
+    def test_list_int(self):
+        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 2.0]),
+                        SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 2.0),
+                        array.array('d', [1.0, 2.0])]:
+            vs = VectorSlicer(indices=indices)
+            self.assertListEqual(vs.getIndices(), [1, 2])
+            self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
+        self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
+
+    def test_list_float(self):
+        b = Bucketizer(splits=[1, 4])
+        self.assertEqual(b.getSplits(), [1.0, 4.0])
+        self.assertTrue(all([type(v) == float for v in b.getSplits()]))
+        self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
+
+    def test_list_string(self):
+        for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 'b'])]:
+            idx_to_string = IndexToString(labels=labels)
+            self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
+        self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
+
+    def test_string(self):
+        lr = LogisticRegression()
+        for col in ['features', u'features', np.str_('features')]:
+            lr.setFeaturesCol(col)
+            self.assertEqual(lr.getFeaturesCol(), 'features')
+        self.assertRaises(TypeError, lambda: LogisticRegression(featuresCol=2.3))
+
+    def test_bool(self):
+        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
+        self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
 
 
 class PipelineTests(PySparkTestCase):

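The converter factories on TypeConverters are plain static functions, which is why test_list above can call them directly without building an estimator. A small sketch, runnable without a SparkContext, mirroring the cases the tests cover:

    from pyspark.ml.param import TypeConverters

    TypeConverters.toList((0, 1))             # [0, 1]
    TypeConverters.toListFloat([1, 4])        # [1.0, 4.0]
    TypeConverters.toInt(5.0)                 # 5

    try:
        TypeConverters.toBoolean(1)           # ints are not silently treated as booleans
    except TypeError as e:
        print(e)
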
http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/tuning.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 77af009..a528d22 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -20,7 +20,7 @@ import numpy as np
 
 from pyspark import since
 from pyspark.ml import Estimator, Model
-from pyspark.ml.param import Params, Param
+from pyspark.ml.param import Params, Param, TypeConverters
 from pyspark.ml.param.shared import HasSeed
 from pyspark.ml.util import keyword_only
 from pyspark.sql.functions import rand
@@ -121,7 +121,8 @@ class CrossValidator(Estimator, HasSeed):
     evaluator = Param(
         Params._dummy(), "evaluator",
         "evaluator used to select hyper-parameters that maximize the cross-validated metric")
-    numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation")
+    numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross validation",
+                     typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, estimator=None, estimatorParamMaps=None, evaluator=None, numFolds=3,

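CrossValidator gets the same treatment for numFolds, while the evaluator param keeps no converter because it holds an Evaluator object rather than a primitive value. A minimal sketch (CrossValidator is implemented in Python, so this should not need the JVM; treat it as illustrative rather than definitive):

    from pyspark.ml.tuning import CrossValidator

    cv = CrossValidator(numFolds=3.0)         # TypeConverters.toInt coerces the integral float
    assert cv.getNumFolds() == 3

    try:
        CrossValidator(numFolds="three")      # fails fast at set time
    except TypeError as e:
        print(e)
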
