You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/08/15 10:05:09 UTC
[spark] branch branch-3.2 updated: [SPARK-40079] Add Imputer inputCols validation for empty input case
This is an automated email from the ASF dual-hosted git repository.
weichenxu123 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 2b54b48cd85 [SPARK-40079] Add Imputer inputCols validation for empty input case
2b54b48cd85 is described below
commit 2b54b48cd852f93e8cf24397df6f3ec5b755233e
Author: Weichen Xu <we...@databricks.com>
AuthorDate: Mon Aug 15 18:03:08 2022 +0800
[SPARK-40079] Add Imputer inputCols validation for empty input case
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
### What changes were proposed in this pull request?
Add Imputer inputCols validation for empty input case
### Why are the changes needed?
If the Imputer `inputCols` param is empty, `fit` works fine, but when saving the model the following error is raised:
>
AnalysisException:
Datasource does not support writing empty or nested empty schemas.
Please make sure the data schema has at least one or more column(s).
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit test.
Closes #37518 from WeichenXu123/imputer-param-validation.
Authored-by: Weichen Xu <we...@databricks.com>
Signed-off-by: Weichen Xu <we...@databricks.com>
(cherry picked from commit 87094f89655b7df09cdecb47c653461ae855b0ac)
Signed-off-by: Weichen Xu <we...@databricks.com>
---
mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 1 +
.../test/scala/org/apache/spark/ml/feature/ImputerSuite.scala | 10 ++++++++++
2 files changed, 11 insertions(+)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
index 71403acc91b..5998887923f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
@@ -81,6 +81,7 @@ private[feature] trait ImputerParams extends Params with HasInputCol with HasInp
protected def validateAndTransformSchema(schema: StructType): StructType = {
ParamValidators.checkSingleVsMultiColumnParams(this, Seq(outputCol), Seq(outputCols))
val (inputColNames, outputColNames) = getInOutCols()
+ require(inputColNames.length > 0, "inputCols cannot be empty")
require(inputColNames.length == inputColNames.distinct.length, s"inputCols contains" +
s" duplicates: (${inputColNames.mkString(", ")})")
require(outputColNames.length == outputColNames.distinct.length, s"outputCols contains" +
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
index 30887f55638..5ef22a282c3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ImputerSuite.scala
@@ -268,6 +268,16 @@ class ImputerSuite extends MLTest with DefaultReadWriteTest {
}
assert(e.getMessage.contains("outputCols contains duplicates"))
}
+
+ withClue("Imputer should fail if inputCols param is empty.") {
+ val e: IllegalArgumentException = intercept[IllegalArgumentException] {
+ val imputer = new Imputer().setStrategy(strategy)
+ .setInputCols(Array[String]())
+ .setOutputCols(Array[String]())
+ val model = imputer.fit(df)
+ }
+ assert(e.getMessage.contains("inputCols cannot be empty"))
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org