You are viewing a plain text version of this content. The canonical link for it is here.

Posted to reviews@spark.apache.org by WeichenXu123 <gi...@git.apache.org> on 2018/01/19 18:58:57 UTC

[GitHub] spark pull request #17123: [SPARK-19781][ML] Handle NULLs as well as NaNs in...

Github user WeichenXu123 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17123#discussion_r162703633
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala ---
    @@ -105,20 +106,21 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
         transformSchema(dataset.schema)
         val (filteredDataset, keepInvalid) = {
           if (getHandleInvalid == Bucketizer.SKIP_INVALID) {
    -        // "skip" NaN option is set, will filter out NaN values in the dataset
    +        // "skip" NaN/NULL option is set, will filter out NaN/NULL values in the dataset
             (dataset.na.drop().toDF(), false)
           } else {
             (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID)
           }
         }
     
    -    val bucketizer: UserDefinedFunction = udf { (feature: Double) =>
    --- End diff --
    
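    As @cloud-fan suggested, it would be better to declare the UDF's `feature` parameter as `Option[Double]`, so that NULL inputs can be handled explicitly. :-)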


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org