You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ml...@apache.org on 2016/04/09 09:57:26 UTC

spark git commit: [SPARK-14392][ML] CountVectorizer Estimator should include binary toggle Param

Repository: spark
Updated Branches:
  refs/heads/master 90c0a0450 -> a9b8b655b


[SPARK-14392][ML] CountVectorizer Estimator should include binary toggle Param

## What changes were proposed in this pull request?

CountVectorizerModel has a binary toggle param. This PR adds the same binary toggle param to the CountVectorizer estimator. As discussed in the JIRA, instead of adding a separate param to CountVectorizer, I moved the binary param into CountVectorizerParams, so both the estimator and the model inherit it.

## How was this patch tested?

Added a new test case that fits the model with the binary flag set to true and then checks that all non-zero counts produced by the trained model are set to 1.0.

All tests in CountVectorizerSuite.scala pass.

Author: wm624@hotmail.com <wm...@hotmail.com>

Closes #12200 from wangmiao1981/binary_param.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a9b8b655
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a9b8b655
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a9b8b655

Branch: refs/heads/master
Commit: a9b8b655b25f4ed519037faaf7601a3d9842547f
Parents: 90c0a04
Author: wm624@hotmail.com <wm...@hotmail.com>
Authored: Sat Apr 9 09:57:07 2016 +0200
Committer: Nick Pentreath <ni...@gmail.com>
Committed: Sat Apr 9 09:57:07 2016 +0200

----------------------------------------------------------------------
 .../spark/ml/feature/CountVectorizer.scala      | 35 ++++++++++----------
 .../spark/ml/feature/CountVectorizerSuite.scala | 19 +++++++++--
 2 files changed, 34 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a9b8b655/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 5694b38..f1be971 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -100,6 +100,21 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
 
   /** @group getParam */
   def getMinTF: Double = $(minTF)
+
+  /**
+   * Binary toggle to control the output vector values.
+   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
+   * discrete probabilistic models that model binary events rather than integer counts.
+   * Default: false
+   * @group param
+   */
+  val binary: BooleanParam =
+    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1.")
+
+  /** @group getParam */
+  def getBinary: Boolean = $(binary)
+
+  setDefault(binary -> false)
 }
 
 /**
@@ -127,6 +142,9 @@ class CountVectorizer(override val uid: String)
   /** @group setParam */
   def setMinTF(value: Double): this.type = set(minTF, value)
 
+  /** @group setParam */
+  def setBinary(value: Boolean): this.type = set(binary, value)
+
   setDefault(vocabSize -> (1 << 18), minDF -> 1)
 
   override def fit(dataset: DataFrame): CountVectorizerModel = {
@@ -206,26 +224,9 @@ class CountVectorizerModel(override val uid: String, val vocabulary: Array[Strin
   /** @group setParam */
   def setMinTF(value: Double): this.type = set(minTF, value)
 
-  /**
-   * Binary toggle to control the output vector values.
-   * If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for
-   * discrete probabilistic models that model binary events rather than integer counts.
-   * Default: false
-   * @group param
-   */
-  val binary: BooleanParam =
-    new BooleanParam(this, "binary", "If True, all non zero counts are set to 1. " +
-      "This is useful for discrete probabilistic models that model binary events rather " +
-      "than integer counts")
-
-  /** @group getParam */
-  def getBinary: Boolean = $(binary)
-
   /** @group setParam */
   def setBinary(value: Boolean): this.type = set(binary, value)
 
-  setDefault(binary -> false)
-
   /** Dictionary created from [[vocabulary]] and its indices, broadcast once for [[transform()]] */
   private var broadcastDict: Option[Broadcast[Map[String, Int]]] = None
 

http://git-wip-us.apache.org/repos/asf/spark/blob/a9b8b655/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
index 04f165c..ff0de06 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala
@@ -168,21 +168,34 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
 
-  test("CountVectorizerModel with binary") {
+  test("CountVectorizerModel and CountVectorizer with binary") {
     val df = sqlContext.createDataFrame(Seq(
-      (0, split("a a a b b c"), Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (0, split("a a a a b b b b c d"),
+      Vectors.sparse(4, Seq((0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)))),
       (1, split("c c c"), Vectors.sparse(4, Seq((2, 1.0)))),
       (2, split("a"), Vectors.sparse(4, Seq((0, 1.0))))
     )).toDF("id", "words", "expected")
 
-    val cv = new CountVectorizerModel(Array("a", "b", "c", "d"))
+    // CountVectorizer test
+    val cv = new CountVectorizer()
       .setInputCol("words")
       .setOutputCol("features")
       .setBinary(true)
+      .fit(df)
     cv.transform(df).select("features", "expected").collect().foreach {
       case Row(features: Vector, expected: Vector) =>
         assert(features ~== expected absTol 1e-14)
     }
+
+    // CountVectorizerModel test
+    val cv2 = new CountVectorizerModel(cv.vocabulary)
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setBinary(true)
+    cv2.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
   }
 
   test("CountVectorizer read/write") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org