Posted to commits@spark.apache.org by ml...@apache.org on 2017/08/21 12:36:00 UTC

spark git commit: [SPARK-21468][PYSPARK][ML] Python API for FeatureHasher

Repository: spark
Updated Branches:
  refs/heads/master b3a07526f -> 988b84d7e


[SPARK-21468][PYSPARK][ML] Python API for FeatureHasher

## What changes were proposed in this pull request?

Add Python API for `FeatureHasher` transformer.
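A minimal usage sketch of the new API, mirroring the doc test added in this patch (this assumes an active `SparkSession` bound to the name `spark`):

```python
from pyspark.ml.feature import FeatureHasher

df = spark.createDataFrame(
    [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")],
    ["real", "bool", "stringNum", "string"])

hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

# Each row is hashed into a sparse vector of size numFeatures (default 1 << 18)
hasher.transform(df).head().features
# SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
```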

## How was this patch tested?

New doc test.

Author: Nick Pentreath <ni...@za.ibm.com>

Closes #18970 from MLnick/SPARK-21468-pyspark-hasher.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/988b84d7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/988b84d7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/988b84d7

Branch: refs/heads/master
Commit: 988b84d7ed43bea2616527ff050dffcf20548ab2
Parents: b3a0752
Author: Nick Pentreath <ni...@za.ibm.com>
Authored: Mon Aug 21 14:35:38 2017 +0200
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Mon Aug 21 14:35:38 2017 +0200

----------------------------------------------------------------------
 .../apache/spark/ml/feature/FeatureHasher.scala | 16 ++--
 python/pyspark/ml/feature.py                    | 77 ++++++++++++++++++++
 2 files changed, 85 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/988b84d7/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
index d22bf16..4b91fa9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
@@ -64,17 +64,17 @@ import org.apache.spark.util.collection.OpenHashMap
  *   ).toDF("real", "bool", "stringNum", "string")
  *
  *   val hasher = new FeatureHasher()
- *    .setInputCols("real", "bool", "stringNum", "num")
+ *    .setInputCols("real", "bool", "stringNum", "string")
  *    .setOutputCol("features")
  *
- *   hasher.transform(df).show()
+ *   hasher.transform(df).show(false)
  *
- *   +----+-----+---------+------+--------------------+
- *   |real| bool|stringNum|string|            features|
- *   +----+-----+---------+------+--------------------+
- *   | 2.0| true|        1|   foo|(262144,[51871,63...|
- *   | 3.0|false|        2|   bar|(262144,[6031,806...|
- *   +----+-----+---------+------+--------------------+
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |real|bool |stringNum|string|features                                              |
+ *   +----+-----+---------+------+------------------------------------------------------+
+ *   |2.0 |true |1        |foo   |(262144,[51871,63643,174475,253195],[1.0,1.0,2.0,1.0])|
+ *   |3.0 |false|2        |bar   |(262144,[6031,80619,140467,174475],[1.0,1.0,1.0,3.0]) |
+ *   +----+-----+---------+------+------------------------------------------------------+
  * }}}
  */
 @Experimental

http://git-wip-us.apache.org/repos/asf/spark/blob/988b84d7/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 54b4026..050537b 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -34,6 +34,7 @@ __all__ = ['Binarizer',
            'CountVectorizer', 'CountVectorizerModel',
            'DCT',
            'ElementwiseProduct',
+           'FeatureHasher',
            'HashingTF',
            'IDF', 'IDFModel',
            'Imputer', 'ImputerModel',
@@ -697,6 +698,82 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReada
 
 
 @inherit_doc
+class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, JavaMLReadable,
+                    JavaMLWritable):
+    """
+    .. note:: Experimental
+
+    Feature hashing projects a set of categorical or numerical features into a feature vector of
+    specified dimension (typically substantially smaller than that of the original feature
+    space). This is done using the hashing trick (https://en.wikipedia.org/wiki/Feature_hashing)
+    to map features to indices in the feature vector.
+
+    The FeatureHasher transformer operates on multiple columns. Each column may contain either
+    numeric or categorical features. Behavior and handling of column data types is as follows:
+
+    * Numeric columns:
+        For numeric features, the hash value of the column name is used to map the
+        feature value to its index in the feature vector. Numeric features are never
+        treated as categorical, even when they are integers. You must explicitly
+        convert numeric columns containing categorical features to strings first.
+
+    * String columns:
+        For categorical features, the hash value of the string "column_name=value"
+        is used to map to the vector index, with an indicator value of `1.0`.
+        Thus, categorical features are "one-hot" encoded
+        (similarly to using :py:class:`OneHotEncoder` with `dropLast=false`).
+
+    * Boolean columns:
+        Boolean values are treated in the same way as string columns. That is,
+        boolean features are represented as "column_name=true" or "column_name=false",
+        with an indicator value of `1.0`.
+
+    Null (missing) values are ignored (implicitly zero in the resulting feature vector).
+
+    Since a simple modulo is used to map the hash value to a vector index,
+    it is advisable to use a power of two as the `numFeatures` parameter;
+    otherwise the features will not be mapped evenly to the vector indices.
+
+    >>> data = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
+    >>> cols = ["real", "bool", "stringNum", "string"]
+    >>> df = spark.createDataFrame(data, cols)
+    >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
+    >>> hasher.transform(df).head().features
+    SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+    >>> hasherPath = temp_path + "/hasher"
+    >>> hasher.save(hasherPath)
+    >>> loadedHasher = FeatureHasher.load(hasherPath)
+    >>> loadedHasher.getNumFeatures() == hasher.getNumFeatures()
+    True
+    >>> loadedHasher.transform(df).head().features == hasher.transform(df).head().features
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    @keyword_only
+    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        """
+        super(FeatureHasher, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
+        self._setDefault(numFeatures=1 << 18)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.3.0")
+    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+        """
+        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        Sets params for this FeatureHasher.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+
+@inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable,
                 JavaMLWritable):
     """


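For readers unfamiliar with the hashing trick described in the docstring above, the following toy sketch illustrates the indexing rules. It is an illustration only: it uses Python's built-in `hash()` (which is salted per process) rather than the MurmurHash3 function Spark actually uses, so the indices it produces will not match `FeatureHasher`'s output.

```python
def hash_features(row, num_features=1 << 18):
    """Toy version of the FeatureHasher indexing rules (illustration only)."""
    vec = {}
    for col, value in row.items():
        if value is None:
            continue  # nulls are ignored (implicitly zero in the vector)
        if isinstance(value, (bool, str)):
            # categorical/boolean: hash "column_name=value", indicator value 1.0
            idx = hash("%s=%s" % (col, str(value).lower())) % num_features
            vec[idx] = vec.get(idx, 0.0) + 1.0
        else:
            # numeric: hash the column name; the feature value itself is stored
            idx = hash(col) % num_features
            vec[idx] = vec.get(idx, 0.0) + float(value)
    return vec

# Per the docstring above, a power of two for num_features keeps the simple
# modulo from mapping hash values unevenly across the vector indices.
print(hash_features({"real": 2.0, "bool": True, "stringNum": "1", "string": "foo"}))
```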