You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2018/02/22 01:26:36 UTC
spark git commit: [SPARK-22700][ML] Bucketizer.transform incorrectly drops row containing NaN - for branch-2.2

Repository: spark
Updated Branches:
  refs/heads/branch-2.2 a95c3e29d -> 1cc34f3e5


[SPARK-22700][ML] Bucketizer.transform incorrectly drops row containing NaN - for branch-2.2

## What changes were proposed in this pull request?
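This is the backport of the fix to branch-2.2.
When handleInvalid is set to "skip", Bucketizer.transform now drops only the rows that contain NaN in the input column, instead of dropping every row that contains NaN in any column of the dataset.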

## How was this patch tested?

Existing tests, plus a new test added to BucketizerSuite that checks only rows with NaN in the input column are dropped.

Author: Zheng RuiFeng <ru...@foxmail.com>

Closes #20539 from zhengruifeng/bucketizer_nan_2.2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cc34f3e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cc34f3e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cc34f3e

Branch: refs/heads/branch-2.2
Commit: 1cc34f3e58c92dd06545727e9d931008a1082bbf
Parents: a95c3e2
Author: Zheng RuiFeng <ru...@foxmail.com>
Authored: Wed Feb 21 17:26:33 2018 -0800
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Wed Feb 21 17:26:33 2018 -0800

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/ml/feature/Bucketizer.scala | 2 +-
 .../scala/org/apache/spark/ml/feature/BucketizerSuite.scala | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1cc34f3e/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
index bb8f2a3..f585ff0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala
@@ -106,7 +106,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
     val (filteredDataset, keepInvalid) = {
       if (getHandleInvalid == Bucketizer.SKIP_INVALID) {
         // "skip" NaN option is set, will filter out NaN values in the dataset
-        (dataset.na.drop().toDF(), false)
+        (dataset.na.drop(Seq($(inputCol))).toDF(), false)
       } else {
         (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID)
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/1cc34f3e/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
index 420fb17..32e50a9 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala
@@ -187,6 +187,15 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa
       }
     }
   }
+
+  test("Bucketizer should only drop NaN in input columns, with handleInvalid=skip") {
+    val df = spark.createDataFrame(Seq((2.3, 3.0), (Double.NaN, 3.0), (6.7, Double.NaN)))
+      .toDF("a", "b")
+    val splits = Array(Double.NegativeInfinity, 3.0, Double.PositiveInfinity)
+    val bucketizer = new Bucketizer().setInputCol("a").setOutputCol("x").setSplits(splits)
+    bucketizer.setHandleInvalid("skip")
+    assert(bucketizer.transform(df).count() == 2)
+  }
 }
 
 private object BucketizerSuite extends SparkFunSuite {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org