You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ml...@apache.org on 2017/08/21 12:36:00 UTC
spark git commit: [SPARK-21468][PYSPARK][ML] Python API for FeatureHasher
Repository: spark
Updated Branches:
refs/heads/master b3a07526f -> 988b84d7e
[SPARK-21468][PYSPARK][ML] Python API for FeatureHasher
Add Python API for `FeatureHasher` transformer.
## How was this patch tested?
New doc test.
Author: Nick Pentreath <ni...@za.ibm.com>
Closes #18970 from MLnick/SPARK-21468-pyspark-hasher.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/988b84d7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/988b84d7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/988b84d7
Branch: refs/heads/master
Commit: 988b84d7ed43bea2616527ff050dffcf20548ab2
Parents: b3a0752
Author: Nick Pentreath <ni...@za.ibm.com>
Authored: Mon Aug 21 14:35:38 2017 +0200
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Mon Aug 21 14:35:38 2017 +0200
----------------------------------------------------------------------
.../apache/spark/ml/feature/FeatureHasher.scala | 16 ++--
python/pyspark/ml/feature.py | 77 ++++++++++++++++++++
2 files changed, 85 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/988b84d7/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
index d22bf16..4b91fa9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
@@ -64,17 +64,17 @@ import org.apache.spark.util.collection.OpenHashMap
* ).toDF("real", "bool", "stringNum", "string")
*
* val hasher = new FeatureHasher()
- * .setInputCols("real", "bool", "stringNum", "num")
+ * .setInputCols("real", "bool", "stringNum", "string")
* .setOutputCol("features")
*
- * hasher.transform(df).show()
+ * hasher.transform(df).show(false)
*
- * +----+-----+---------+------+--------------------+
- * |real| bool|stringNum|string| features|
- * +----+-----+---------+------+--------------------+
- * | 2.0| true| 1| foo|(262144,[51871,63...|
- * | 3.0|false| 2| bar|(262144,[6031,806...|
- * +----+-----+---------+------+--------------------+
+ * +----+-----+---------+------+------------------------------------------------------+
+ * |real|bool |stringNum|string|features |
+ * +----+-----+---------+------+------------------------------------------------------+
+ * |2.0 |true |1 |foo |(262144,[51871,63643,174475,253195],[1.0,1.0,2.0,1.0])|
+ * |3.0 |false|2 |bar |(262144,[6031,80619,140467,174475],[1.0,1.0,1.0,3.0]) |
+ * +----+-----+---------+------+------------------------------------------------------+
* }}}
*/
@Experimental
http://git-wip-us.apache.org/repos/asf/spark/blob/988b84d7/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 54b4026..050537b 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -34,6 +34,7 @@ __all__ = ['Binarizer',
'CountVectorizer', 'CountVectorizerModel',
'DCT',
'ElementwiseProduct',
+ 'FeatureHasher',
'HashingTF',
'IDF', 'IDFModel',
'Imputer', 'ImputerModel',
@@ -697,6 +698,82 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
@inherit_doc
+class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
+ JavaMLWritable):
+ """
+ .. note:: Experimental
+
+ Feature hashing projects a set of categorical or numerical features into a feature vector of
+ specified dimension (typically substantially smaller than that of the original feature
+ space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
+ to map features to indices in the feature vector.
+
+ The FeatureHasher transformer operates on multiple columns. Each column may contain either
+ numeric or categorical features. Column data types are handled as follows:
+
+ * Numeric columns:
+ For numeric features, the hash value of the column name is used to map the
+ feature value to its index in the feature vector. Numeric features are never
+ treated as categorical, even when they are integers. You must explicitly
+ convert numeric columns containing categorical features to strings first.
+
+ * String columns:
+ For categorical features, the hash value of the string "column_name=value"
+ is used to map to the vector index, with an indicator value of `1.0`.
+ Thus, categorical features are "one-hot" encoded
+ (similarly to using :py:class:`OneHotEncoder` with `dropLast=False`).
+
+ * Boolean columns:
+ Boolean values are treated in the same way as string columns. That is,
+ boolean features are represented as "column_name=true" or "column_name=false",
+ with an indicator value of `1.0`.
+
+ Null (missing) values are ignored (implicitly zero in the resulting feature vector).
+
+ Since a simple modulo is used to map the hash value to a vector index,
+ it is advisable to use a power of two as the `numFeatures` parameter;
+ otherwise the features will not be mapped evenly to the vector indices.
+
+ >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+ >>> cols = ["real", "bool", "stringNum", "string"]
+ >>> df = spark.createDataFrame(data, cols)
+ >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
+ >>> hasher.transform(df).head().features
+ SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+ >>> hasherPath = temp_path + "/hasher"
+ >>> hasher.save(hasherPath)
+ >>> loadedHasher = FeatureHasher.load(hasherPath)
+ >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
+ True
+ >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
+ True
+
+ .. versionadded:: 2.3.0
+ """
+
+ @keyword_only
+ def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+ """
+ __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+ """
+ super(FeatureHasher, self).__init__()
+ self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
+ self._setDefault(numFeatures=1 << 18)
+ kwargs = self._input_kwargs
+ self.setParams(**kwargs)
+
+ @keyword_only
+ @since("2.3.0")
+ def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+ """
+ setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+ Sets params for this FeatureHasher.
+ """
+ kwargs = self._input_kwargs
+ return self._set(**kwargs)
+
+
+@inherit_doc
class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
JavaMLWritable):
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org