Posted to commits@spark.apache.org by ml...@apache.org on 2018/01/26 21:49:01 UTC
spark git commit: Revert "[SPARK-22797][PYSPARK] Bucketizer support multi-column"
Repository: spark
Updated Branches:
refs/heads/branch-2.3 ca3613be2 -> f5911d489
Revert "[SPARK-22797][PYSPARK] Bucketizer support multi-column"
This reverts commit ab1b5d921b395cb7df3a3a2c4a7e5778d98e6f01.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5911d48
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5911d48
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5911d48
Branch: refs/heads/branch-2.3
Commit: f5911d4894700eb48f794133cbd363bf3b7c8c8e
Parents: ca3613b
Author: Nick Pentreath <ni...@za.ibm.com>
Authored: Fri Jan 26 23:17:45 2018 +0200
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Fri Jan 26 23:17:45 2018 +0200
----------------------------------------------------------------------
python/pyspark/ml/feature.py | 105 ++++++++-----------------------
python/pyspark/ml/param/__init__.py | 10 ---
python/pyspark/ml/tests.py | 9 ---
3 files changed, 25 insertions(+), 99 deletions(-)
----------------------------------------------------------------------
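In short, this puts branch-2.3 back on the single-column Bucketizer API. As a
minimal sketch of the usage that survives the revert, mirroring the doctest
retained in the diff below (assumes an active SparkSession named spark):

    from pyspark.ml.feature import Bucketizer

    # Four split points (-inf, 0.5, 1.4, inf) define three buckets:
    # [-inf, 0.5), [0.5, 1.4), and [1.4, inf].
    df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
    bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                            inputCol="values", outputCol="buckets")
    bucketizer.transform(df).show()  # 0.1/0.4 -> 0.0, 1.2 -> 1.0, 1.5 -> 2.0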
http://git-wip-us.apache.org/repos/asf/spark/blob/f5911d48/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index fdc7787..da85ba7 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -317,33 +317,26 @@ class BucketedRandomProjectionLSHModel(LSHModel, JavaMLReadable, JavaMLWritable)
@inherit_doc
-class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOutputCols,
- HasHandleInvalid, JavaMLReadable, JavaMLWritable):
- """
- Maps a column of continuous features to a column of feature buckets. Since 2.3.0,
- :py:class:`Bucketizer` can map multiple columns at once by setting the :py:attr:`inputCols`
- parameter. Note that when both the :py:attr:`inputCol` and :py:attr:`inputCols` parameters
- are set, an Exception will be thrown. The :py:attr:`splits` parameter is only used for single
- column usage, and :py:attr:`splitsArray` is for multiple columns.
-
- >>> values = [(0.1, 0.0), (0.4, 1.0), (1.2, 1.3), (1.5, float("nan")),
- ... (float("nan"), 1.0), (float("nan"), 0.0)]
- >>> df = spark.createDataFrame(values, ["values1", "values2"])
+class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid,
+ JavaMLReadable, JavaMLWritable):
+ """
+ Maps a column of continuous features to a column of feature buckets.
+
+ >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
+ >>> df = spark.createDataFrame(values, ["values"])
>>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
- ... inputCol="values1", outputCol="buckets")
- >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df.select("values1"))
- >>> bucketed.show(truncate=False)
- +-------+-------+
- |values1|buckets|
- +-------+-------+
- |0.1 |0.0 |
- |0.4 |0.0 |
- |1.2 |1.0 |
- |1.5 |2.0 |
- |NaN |3.0 |
- |NaN |3.0 |
- +-------+-------+
- ...
+ ... inputCol="values", outputCol="buckets")
+ >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()
+ >>> len(bucketed)
+ 6
+ >>> bucketed[0].buckets
+ 0.0
+ >>> bucketed[1].buckets
+ 0.0
+ >>> bucketed[2].buckets
+ 1.0
+ >>> bucketed[3].buckets
+ 2.0
>>> bucketizer.setParams(outputCol="b").transform(df).head().b
0.0
>>> bucketizerPath = temp_path + "/bucketizer"
@@ -354,22 +347,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
>>> bucketed = bucketizer.setHandleInvalid("skip").transform(df).collect()
>>> len(bucketed)
4
- >>> bucketizer2 = Bucketizer(splitsArray=
- ... [[-float("inf"), 0.5, 1.4, float("inf")], [-float("inf"), 0.5, float("inf")]],
- ... inputCols=["values1", "values2"], outputCols=["buckets1", "buckets2"])
- >>> bucketed2 = bucketizer2.setHandleInvalid("keep").transform(df)
- >>> bucketed2.show(truncate=False)
- +-------+-------+--------+--------+
- |values1|values2|buckets1|buckets2|
- +-------+-------+--------+--------+
- |0.1 |0.0 |0.0 |0.0 |
- |0.4 |1.0 |0.0 |1.0 |
- |1.2 |1.3 |1.0 |1.0 |
- |1.5 |NaN |2.0 |2.0 |
- |NaN |1.0 |3.0 |1.0 |
- |NaN |0.0 |3.0 |0.0 |
- +-------+-------+--------+--------+
- ...
.. versionadded:: 1.4.0
"""
@@ -386,30 +363,14 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
"Options are 'skip' (filter out rows with invalid values), " +
- "'error' (throw an error), or 'keep' (keep invalid values in a " +
- "special additional bucket). Note that in the multiple column " +
- "case, the invalid handling is applied to all columns. That said " +
- "for 'error' it will throw an error if any invalids are found in " +
- "any column, for 'skip' it will skip rows with any invalids in " +
- "any columns, etc.",
+ "'error' (throw an error), or 'keep' (keep invalid values in a special " +
+ "additional bucket).",
typeConverter=TypeConverters.toString)
- splitsArray = Param(Params._dummy(), "splitsArray", "The array of split points for mapping " +
- "continuous features into buckets for multiple columns. For each input " +
- "column, with n+1 splits, there are n buckets. A bucket defined by " +
- "splits x,y holds values in the range [x,y) except the last bucket, " +
- "which also includes y. The splits should be of length >= 3 and " +
- "strictly increasing. Values at -inf, inf must be explicitly provided " +
- "to cover all Double values; otherwise, values outside the splits " +
- "specified will be treated as errors.",
- typeConverter=TypeConverters.toListListFloat)
-
@keyword_only
- def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
- splitsArray=None, inputCols=None, outputCols=None):
+ def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
"""
- __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
- splitsArray=None, inputCols=None, outputCols=None)
+ __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
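The handleInvalid Param above keeps only its single-column semantics after the
revert. A short sketch of how the three modes differ on the NaN rows, reusing
the df and bucketizer objects from the retained doctest (the counts match that
doctest):

    bucketizer.setHandleInvalid("keep").transform(df).count()   # 6: NaNs land in an extra bucket (3.0)
    bucketizer.setHandleInvalid("skip").transform(df).count()   # 4: NaN rows are filtered out
    bucketizer.setHandleInvalid("error").transform(df)          # raises on the NaN values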
@@ -419,11 +380,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
@keyword_only
@since("1.4.0")
- def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error",
- splitsArray=None, inputCols=None, outputCols=None):
+ def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
"""
- setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error", \
- splitsArray=None, inputCols=None, outputCols=None)
+ setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
Sets params for this Bucketizer.
"""
kwargs = self._input_kwargs
@@ -443,20 +402,6 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasInputCols, HasOu
"""
return self.getOrDefault(self.splits)
- @since("2.3.0")
- def setSplitsArray(self, value):
- """
- Sets the value of :py:attr:`splitsArray`.
- """
- return self._set(splitsArray=value)
-
- @since("2.3.0")
- def getSplitsArray(self):
- """
- Gets the array of split points or its default value.
- """
- return self.getOrDefault(self.splitsArray)
-
@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
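For reference, the multi-column form deleted above (taken from the removed
doctest) no longer constructs on branch-2.3; with splitsArray, inputCols and
outputCols gone from the Python wrapper, a call like this should now fail with
an unexpected-keyword TypeError:

    # Removed by this revert -- no longer a valid constructor call:
    bucketizer2 = Bucketizer(
        splitsArray=[[-float("inf"), 0.5, 1.4, float("inf")],
                     [-float("inf"), 0.5, float("inf")]],
        inputCols=["values1", "values2"],
        outputCols=["buckets1", "buckets2"])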
http://git-wip-us.apache.org/repos/asf/spark/blob/f5911d48/python/pyspark/ml/param/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py
index 5b6b702..043c25c 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -135,16 +135,6 @@ class TypeConverters(object):
raise TypeError("Could not convert %s to list of floats" % value)
@staticmethod
- def toListListFloat(value):
- """
- Convert a value to list of list of floats, if possible.
- """
- if TypeConverters._can_convert_to_list(value):
- value = TypeConverters.toList(value)
- return [TypeConverters.toListFloat(v) for v in value]
- raise TypeError("Could not convert %s to list of list of floats" % value)
-
- @staticmethod
def toListInt(value):
"""
Convert a value to list of ints, if possible.
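The toListListFloat converter removed above loses its only caller once
splitsArray is gone. For illustration only, a standalone sketch (a
hypothetical helper, not part of pyspark) of the coercion it performed,
matching the expectations of the test deleted below:

    def to_list_list_float(value):
        # Coerce a nested sequence into a list of lists of Python floats.
        return [[float(v) for v in inner] for inner in value]

    to_list_list_float([[-0.1, 0.5, 3], [-5, 1.5]])
    # -> [[-0.1, 0.5, 3.0], [-5.0, 1.5]]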
http://git-wip-us.apache.org/repos/asf/spark/blob/f5911d48/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index b8bddbd..1af2b91 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -238,15 +238,6 @@ class ParamTypeConversionTests(PySparkTestCase):
self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept=1))
self.assertRaises(TypeError, lambda: LogisticRegression(fitIntercept="false"))
- def test_list_list_float(self):
- b = Bucketizer(splitsArray=[[-0.1, 0.5, 3], [-5, 1.5]])
- self.assertEqual(b.getSplitsArray(), [[-0.1, 0.5, 3.0], [-5.0, 1.5]])
- self.assertTrue(all([type(v) == list for v in b.getSplitsArray()]))
- self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[0]]))
- self.assertTrue(all([type(v) == float for v in b.getSplitsArray()[1]]))
- self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=["a", 1.0]))
- self.assertRaises(TypeError, lambda: Bucketizer(splitsArray=[[-5, 1.5], ["a", 1.0]]))
-
class PipelineTests(PySparkTestCase):
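With test_list_list_float gone, the analogous single-column coercion is still
exercised by the splits Param (which, as an assumption here, converts via
TypeConverters.toListFloat); a sanity check in the spirit of the removed test:

    b = Bucketizer(splits=[-0.1, 0.5, 3])
    assert b.getSplits() == [-0.1, 0.5, 3.0]
    assert all(type(v) == float for v in b.getSplits())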