You are viewing a plain text version of this content; the canonical link to the original was not preserved in this rendering.
Posted to commits@spark.apache.org by me...@apache.org on 2016/02/25 21:28:17 UTC
spark git commit: Revert "[SPARK-13444][MLLIB] QuantileDiscretizer
chooses bad splits on large DataFrames"
Repository: spark
Updated Branches:
refs/heads/branch-1.6 5f7440b25 -> d59a08f7c
Revert "[SPARK-13444][MLLIB] QuantileDiscretizer chooses bad splits on large DataFrames"
This reverts commit cb869a143d338985c3d99ef388dd78b1e3d90a73.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d59a08f7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d59a08f7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d59a08f7
Branch: refs/heads/branch-1.6
Commit: d59a08f7c1c455d86e7ee3d6522a3e9c55f9ee02
Parents: 5f7440b
Author: Xiangrui Meng <me...@databricks.com>
Authored: Thu Feb 25 12:28:03 2016 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu Feb 25 12:28:03 2016 -0800
----------------------------------------------------------------------
.../spark/ml/feature/QuantileDiscretizer.scala | 11 ++---------
.../ml/feature/QuantileDiscretizerSuite.scala | 20 --------------------
2 files changed, 2 insertions(+), 29 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/d59a08f7/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
index cd5085a..7bf67c6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala
@@ -97,13 +97,6 @@ final class QuantileDiscretizer(override val uid: String)
@Since("1.6.0")
object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] with Logging {
-
- /**
- * Minimum number of samples required for finding splits, regardless of number of bins. If
- * the dataset has fewer rows than this value, the entire dataset will be used.
- */
- private[spark] val minSamplesRequired: Int = 10000
-
/**
* Sampling from the given dataset to collect quantile statistics.
*/
@@ -111,8 +104,8 @@ object QuantileDiscretizer extends DefaultParamsReadable[QuantileDiscretizer] wi
val totalSamples = dataset.count()
require(totalSamples > 0,
"QuantileDiscretizer requires non-empty input dataset but was given an empty input.")
- val requiredSamples = math.max(numBins * numBins, minSamplesRequired)
- val fraction = math.min(requiredSamples.toDouble / dataset.count(), 1.0)
+ val requiredSamples = math.max(numBins * numBins, 10000)
+ val fraction = math.min(requiredSamples / dataset.count(), 1.0)
dataset.sample(withReplacement = false, fraction, new XORShiftRandom().nextInt()).collect()
}
http://git-wip-us.apache.org/repos/asf/spark/blob/d59a08f7/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
index 32bfa43..3a4f6d2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
@@ -71,26 +71,6 @@ class QuantileDiscretizerSuite
}
}
- test("Test splits on dataset larger than minSamplesRequired") {
- val sqlCtx = SQLContext.getOrCreate(sc)
- import sqlCtx.implicits._
-
- val datasetSize = QuantileDiscretizer.minSamplesRequired + 1
- val numBuckets = 5
- val df = sc.parallelize((1.0 to datasetSize by 1.0).map(Tuple1.apply)).toDF("input")
- val discretizer = new QuantileDiscretizer()
- .setInputCol("input")
- .setOutputCol("result")
- .setNumBuckets(numBuckets)
- .setSeed(1)
-
- val result = discretizer.fit(df).transform(df)
- val observedNumBuckets = result.select("result").distinct.count
-
- assert(observedNumBuckets === numBuckets,
- "Observed number of buckets does not equal expected number of buckets.")
- }
-
test("read/write") {
val t = new QuantileDiscretizer()
.setInputCol("myInputCol")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org