You are viewing a plain text version of this content. The canonical link for it is here.

Posted to reviews@spark.apache.org by GitBox <gi...@apache.org> on 2019/10/09 00:52:11 UTC

[GitHub] [spark] firestarman commented on a change in pull request #25983: [SPARK-29327][MLLIB]Support specifying features via multiple columns

firestarman commented on a change in pull request #25983: [SPARK-29327][MLLIB]Support specifying features via multiple columns
URL: https://github.com/apache/spark/pull/25983#discussion_r332791624
 
 

 ##########
 File path: mllib/src/test/scala/org/apache/spark/ml/PredictorSuite.scala
 ##########
 @@ -55,14 +55,50 @@ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {
       predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features")))
     }
   }
+
+  test("multiple columns for features should work well without side effect") {
+    // Should fail due to not supporting multiple columns
+    intercept[IllegalArgumentException] {
+      new MockPredictor(false).setFeaturesCol(Array("feature1", "feature2", "feature3"))
+    }
+
+    // Only use multiple columns for features
+    val df = spark.createDataFrame(Seq(
+      (0, 1, 0, 2, 3),
+      (1, 2, 0, 3, 9),
+      (0, 3, 0, 2, 6)
+    )).toDF("label", "weight", "feature1", "feature2", "feature3")
+
+    val predictor = new MockPredictor().setWeightCol("weight")
+      .setFeaturesCol(Array("feature1", "feature2", "feature3"))
+    predictor.fit(df)
+
+    // Should fail due to wrong type for column "feature1" in schema
+    intercept[IllegalArgumentException] {
+      predictor.fit(df.select(col("label"), col("weight"),
+        col("feature1").cast(StringType), col("feature2"), col("feature3")))
+    }
+
+    val df2 = df.toDF("label", "weight", "features", "feature2", "feature3")
+    // Should fail due to missing "feature1" in schema
+    intercept[IllegalArgumentException] {
+      predictor.setFeaturesCol(Array("feature1", "feature2", "feature3")).fit(df2)
+    }
+
+    // Should fail due to wrong type in schema for single column of features
 
 Review comment:
   This is for the case where users specify both a single column and multiple columns for features. `features` (the default value) is now used as the single column name, so its type should be Vector, but it is actually Int.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org