You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2014/07/18 00:05:15 UTC
git commit: SPARK-1215 [MLLIB]: Clustering: Index out of bounds error (2)
Repository: spark
Updated Branches:
refs/heads/master 1fcd5dcdd -> 935fe65ff
SPARK-1215 [MLLIB]: Clustering: Index out of bounds error (2)
Added a check to the kMeansPlusPlus initialization in LocalKMeans.scala to handle the case where there are fewer distinct data points than clusters (k). Added two related unit tests to KMeansSuite. (Re-submitting this PR after tangling commits in PR 1407: https://github.com/apache/spark/pull/1407)
Author: Joseph K. Bradley <jo...@gmail.com>
Closes #1468 from jkbradley/kmeans-fix and squashes the following commits:
4e9bd1e [Joseph K. Bradley] Updated PR per comments from mengxr
6c7a2ec [Joseph K. Bradley] Added check to LocalKMeans.scala: kMeansPlusPlus initialization to handle case with fewer distinct data points than clusters k. Added two related unit tests to KMeansSuite.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/935fe65f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/935fe65f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/935fe65f
Branch: refs/heads/master
Commit: 935fe65ff6559a0e3b481e7508fa14337b23020b
Parents: 1fcd5dc
Author: Joseph K. Bradley <jo...@gmail.com>
Authored: Thu Jul 17 15:05:02 2014 -0700
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu Jul 17 15:05:02 2014 -0700
----------------------------------------------------------------------
.../spark/mllib/clustering/LocalKMeans.scala | 8 +++++-
.../spark/mllib/clustering/KMeansSuite.scala | 26 ++++++++++++++++++++
2 files changed, 33 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/935fe65f/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala
index 2e3a4ce..f0722d7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala
@@ -59,7 +59,13 @@ private[mllib] object LocalKMeans extends Logging {
cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j))
j += 1
}
- centers(i) = points(j-1).toDense
+ if (j == 0) {
+ logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." +
+ s" Using duplicate point for center k = $i.")
+ centers(i) = points(0).toDense
+ } else {
+ centers(i) = points(j - 1).toDense
+ }
}
// Run up to maxIterations iterations of Lloyd's algorithm
http://git-wip-us.apache.org/repos/asf/spark/blob/935fe65f/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
index 560a4ad..76a3bdf 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala
@@ -61,6 +61,32 @@ class KMeansSuite extends FunSuite with LocalSparkContext {
assert(model.clusterCenters.head === center)
}
+ test("no distinct points") {
+ val data = sc.parallelize(
+ Array(
+ Vectors.dense(1.0, 2.0, 3.0),
+ Vectors.dense(1.0, 2.0, 3.0),
+ Vectors.dense(1.0, 2.0, 3.0)),
+ 2)
+ val center = Vectors.dense(1.0, 2.0, 3.0)
+
+ // Make sure code runs.
+ var model = KMeans.train(data, k=2, maxIterations=1)
+ assert(model.clusterCenters.size === 2)
+ }
+
+ test("more clusters than points") {
+ val data = sc.parallelize(
+ Array(
+ Vectors.dense(1.0, 2.0, 3.0),
+ Vectors.dense(1.0, 3.0, 4.0)),
+ 2)
+
+ // Make sure code runs.
+ var model = KMeans.train(data, k=3, maxIterations=1)
+ assert(model.clusterCenters.size === 3)
+ }
+
test("single cluster with big dataset") {
val smallData = Array(
Vectors.dense(1.0, 2.0, 6.0),