You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2018/01/24 18:13:50 UTC

spark git commit: [SPARK-23152][ML] - Correctly guard against empty datasets

Repository: spark
Updated Branches:
  refs/heads/master bbb87b350 -> 840dea64a


[SPARK-23152][ML] - Correctly guard against empty datasets

## What changes were proposed in this pull request?

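Correctly guard against empty datasets in `org.apache.spark.ml.classification.Classifier`: when inferring the number of classes from the dataset, a null max-label value is now also treated as an empty dataset, instead of only checking whether the result array is empty.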

## How was this patch tested?

Existing tests, plus a new case in `ClassifierSuite` asserting that `getNumClasses` fails on an empty dataset.

Author: Matthew Tovbin <mt...@salesforce.com>

Closes #20321 from tovbinm/SPARK-23152.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/840dea64
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/840dea64
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/840dea64

Branch: refs/heads/master
Commit: 840dea64abd8a3a5960de830f19a57f5f1aa3bf6
Parents: bbb87b3
Author: Matthew Tovbin <mt...@salesforce.com>
Authored: Wed Jan 24 13:13:44 2018 -0500
Committer: Sean Owen <so...@cloudera.com>
Committed: Wed Jan 24 13:13:44 2018 -0500

----------------------------------------------------------------------
 .../scala/org/apache/spark/ml/classification/Classifier.scala | 2 +-
 .../org/apache/spark/ml/classification/ClassifierSuite.scala  | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/840dea64/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
index bc0b49d..9d1d5aa 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala
@@ -109,7 +109,7 @@ abstract class Classifier[
       case None =>
         // Get number of classes from dataset itself.
         val maxLabelRow: Array[Row] = dataset.select(max($(labelCol))).take(1)
-        if (maxLabelRow.isEmpty) {
+        if (maxLabelRow.isEmpty || maxLabelRow(0).get(0) == null) {
           throw new SparkException("ML algorithm was given empty dataset.")
         }
         val maxDoubleLabel: Double = maxLabelRow.head.getDouble(0)

http://git-wip-us.apache.org/repos/asf/spark/blob/840dea64/mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala
index de71207..87bf2be 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/classification/ClassifierSuite.scala
@@ -90,6 +90,13 @@ class ClassifierSuite extends SparkFunSuite with MLlibTestSparkContext {
       }
       assert(e.getMessage.contains("requires integers in range"))
     }
+    val df3 = getTestData(Seq.empty[Double])
+    withClue("getNumClasses should fail if dataset is empty") {
+      val e: SparkException = intercept[SparkException] {
+        c.getNumClasses(df3)
+      }
+      assert(e.getMessage == "ML algorithm was given empty dataset.")
+    }
   }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org