Posted to commits@spark.apache.org by cu...@apache.org on 2018/03/23 22:59:03 UTC

spark git commit: [SPARK-23615][ML][PYSPARK] Add maxDF Parameter to Python CountVectorizer

Repository: spark
Updated Branches:
  refs/heads/master 95c03cbd2 -> a33655348


[SPARK-23615][ML][PYSPARK] Add maxDF Parameter to Python CountVectorizer

## What changes were proposed in this pull request?

The maxDF parameter filters out terms that appear in too many documents, either by absolute count or by fraction of the corpus. It was recently added to the Scala CountVectorizer and needs to be added to the Python API as well.
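
A minimal usage sketch of the new Python parameter (the data and the maxDF value here are illustrative, not from the patch; the API names match the diff below):

    from pyspark.ml.feature import CountVectorizer
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(0, ["a", "b", "c"]), (1, ["a", "b"]), (2, ["a"])], ["id", "words"])

    # "a" appears in all 3 documents, exceeding maxDF=2, so it is dropped from
    # the vocabulary; maxDF also accepts a fraction in [0, 1).
    cv = CountVectorizer(inputCol="words", outputCol="features", maxDF=2)
    model = cv.fit(df)
    print(model.vocabulary)  # ['b', 'c']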

## How was this patch tested?

Added a unit test covering both the integer (maxDF=3) and fractional (maxDF=0.75) forms of the parameter.

Author: Huaxin Gao <hu...@us.ibm.com>

Closes #20777 from huaxingao/spark-23615.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3365534
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3365534
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3365534

Branch: refs/heads/master
Commit: a33655348c4066d9c1d8ad2055aadfbc892ba7fd
Parents: 95c03cb
Author: Huaxin Gao <hu...@us.ibm.com>
Authored: Fri Mar 23 15:58:48 2018 -0700
Committer: Bryan Cutler <cu...@gmail.com>
Committed: Fri Mar 23 15:58:48 2018 -0700

----------------------------------------------------------------------
 .../spark/ml/feature/CountVectorizer.scala      | 20 +++++-----
 python/pyspark/ml/feature.py                    | 40 +++++++++++++++-----
 python/pyspark/ml/tests.py                      | 25 ++++++++++++
 3 files changed, 67 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 60a4f91..9e0ed43 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -70,19 +70,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   def getMinDF: Double = $(minDF)
 
   /**
-   * Specifies the maximum number of different documents a term must appear in to be included
-   * in the vocabulary.
-   * If this is an integer greater than or equal to 1, this specifies the number of documents
-   * the term must appear in; if this is a double in [0,1), then this specifies the fraction of
-   * documents.
+   * Specifies the maximum number of different documents a term could appear in to be included
+   * in the vocabulary. A term that appears more than the threshold will be ignored. If this is an
+   * integer greater than or equal to 1, this specifies the maximum number of documents the term
+   * could appear in; if this is a double in [0,1), then this specifies the maximum fraction of
+   * documents the term could appear in.
    *
-   * Default: (2^64^) - 1
+   * Default: (2^63^) - 1
    * @group param
    */
   val maxDF: DoubleParam = new DoubleParam(this, "maxDF", "Specifies the maximum number of" +
-    " different documents a term must appear in to be included in the vocabulary." +
-    " If this is an integer >= 1, this specifies the number of documents the term must" +
-    " appear in; if this is a double in [0,1), then this specifies the fraction of documents.",
+    " different documents a term could appear in to be included in the vocabulary." +
+    " A term that appears more than the threshold will be ignored. If this is an integer >= 1," +
+    " this specifies the maximum number of documents the term could appear in;" +
+    " if this is a double in [0,1), then this specifies the maximum fraction of" +
+    " documents the term could appear in.",
     ParamValidators.gtEq(0.0))
 
   /** @group getParam */

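The revised doc comment above gives maxDF a dual interpretation. A plain-Python sketch of that rule, written here only to illustrate the documented behavior (it is not the Scala implementation):

    def max_df_threshold(max_df, num_docs):
        # Per the doc comment: values >= 1.0 are an absolute document count,
        # values in [0, 1) are a fraction of the corpus size.
        return max_df if max_df >= 1.0 else max_df * num_docs

    # With a 100-document corpus, both forms yield the same ceiling of 25 docs:
    assert max_df_threshold(25.0, 100) == 25.0
    assert max_df_threshold(0.25, 100) == 25.0

A term is kept only if its document frequency does not exceed this threshold.
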
http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index a1ceb7f..fcb0dfc 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -422,6 +422,14 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
         " If this is an integer >= 1, this specifies the number of documents the term must" +
         " appear in; if this is a double in [0,1), then this specifies the fraction of documents." +
         " Default 1.0", typeConverter=TypeConverters.toFloat)
+    maxDF = Param(
+        Params._dummy(), "maxDF", "Specifies the maximum number of" +
+        " different documents a term could appear in to be included in the vocabulary." +
+        " A term that appears more than the threshold will be ignored. If this is an" +
+        " integer >= 1, this specifies the maximum number of documents the term could appear in;" +
+        " if this is a double in [0,1), then this specifies the maximum" +
+        " fraction of documents the term could appear in." +
+        " Default (2^63) - 1", typeConverter=TypeConverters.toFloat)
     vocabSize = Param(
         Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.",
         typeConverter=TypeConverters.toInt)
@@ -433,7 +441,7 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
 
     def __init__(self, *args):
         super(_CountVectorizerParams, self).__init__(*args)
-        self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False)
+        self._setDefault(minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False)
 
     @since("1.6.0")
     def getMinTF(self):
@@ -449,6 +457,13 @@ class _CountVectorizerParams(JavaParams, HasInputCol, HasOutputCol):
         """
         return self.getOrDefault(self.minDF)
 
+    @since("2.4.0")
+    def getMaxDF(self):
+        """
+        Gets the value of maxDF or its default value.
+        """
+        return self.getOrDefault(self.maxDF)
+
     @since("1.6.0")
     def getVocabSize(self):
         """
@@ -513,11 +528,11 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
     """
 
     @keyword_only
-    def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
-                 outputCol=None):
+    def __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,
+                 inputCol=None, outputCol=None):
         """
-        __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\
-                 outputCol=None)
+        __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\
+                 inputCol=None, outputCol=None)
         """
         super(CountVectorizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
@@ -527,11 +542,11 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
 
     @keyword_only
     @since("1.6.0")
-    def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,
-                  outputCol=None):
+    def setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,
+                  inputCol=None, outputCol=None):
         """
-        setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\
-                  outputCol=None)
+        setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 18, binary=False,\
+                  inputCol=None, outputCol=None)
         Set the params for the CountVectorizer
         """
         kwargs = self._input_kwargs
@@ -551,6 +566,13 @@ class CountVectorizer(JavaEstimator, _CountVectorizerParams, JavaMLReadable, Jav
         """
         return self._set(minDF=value)
 
+    @since("2.4.0")
+    def setMaxDF(self, value):
+        """
+        Sets the value of :py:attr:`maxDF`.
+        """
+        return self._set(maxDF=value)
+
     @since("1.6.0")
     def setVocabSize(self, value):
         """

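The new setter mirrors the existing minDF accessors. A quick sketch of the round trip (the parameter value is chosen arbitrarily for illustration):

    cv = CountVectorizer(inputCol="words", outputCol="features")
    cv.setMaxDF(0.5)  # fraction form: drop terms in more than half the documents
    assert cv.getMaxDF() == 0.5
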
http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 0801199..cf1ffa1 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -697,6 +697,31 @@ class FeatureTests(SparkSessionTestCase):
             feature, expected = r
             self.assertEqual(feature, expected)
 
+    def test_count_vectorizer_with_maxDF(self):
+        dataset = self.spark.createDataFrame([
+            (0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
+            (1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
+            (2, "a b".split(' '), SparseVector(3, {0: 1.0}),),
+            (3, "a".split(' '), SparseVector(3,  {}),)], ["id", "words", "expected"])
+        cv = CountVectorizer(inputCol="words", outputCol="features")
+        model1 = cv.setMaxDF(3).fit(dataset)
+        self.assertEqual(model1.vocabulary, ['b', 'c', 'd'])
+
+        transformedList1 = model1.transform(dataset).select("features", "expected").collect()
+
+        for r in transformedList1:
+            feature, expected = r
+            self.assertEqual(feature, expected)
+
+        model2 = cv.setMaxDF(0.75).fit(dataset)
+        self.assertEqual(model2.vocabulary, ['b', 'c', 'd'])
+
+        transformedList2 = model2.transform(dataset).select("features", "expected").collect()
+
+        for r in transformedList2:
+            feature, expected = r
+            self.assertEqual(feature, expected)
+
     def test_count_vectorizer_from_vocab(self):
         model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words",
                                                      outputCol="features", minTF=2)

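Why both assertions expect the vocabulary ['b', 'c', 'd']: term 'a' appears in all four documents, so it exceeds both maxDF=3 and maxDF=0.75 (0.75 * 4 = 3 documents), while 'b', 'c', and 'd' stay at or below the threshold. A quick plain-Python check of the document frequencies (no Spark required):

    from collections import Counter

    docs = ["a b c d", "a b c", "a b", "a"]
    doc_freq = Counter(term for doc in docs for term in set(doc.split()))
    print(doc_freq)  # Counter({'a': 4, 'b': 3, 'c': 2, 'd': 1})

    kept = sorted(t for t, n in doc_freq.items() if n <= 3)  # maxDF = 3
    print(kept)  # ['b', 'c', 'd'] -- matches model1.vocabulary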
