You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by hu...@apache.org on 2022/01/19 17:20:34 UTC
[spark] branch branch-3.2 updated: [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans
This is an automated email from the ASF dual-hosted git repository.
huaxingao pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push:
new 5cf8108 [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans
5cf8108 is described below
commit 5cf810870073693f7ec2e1f2efe030567c973fb4
Author: Ruifeng Zheng <ru...@foxmail.com>
AuthorDate: Wed Jan 19 09:17:25 2022 -0800
[SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans
### What changes were proposed in this pull request?
In `KMeansSuite` and `BisectingKMeansSuite`, there are some unused lines:
```
model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
```
For cosine distance, the norm of centering vector should be 1, so the norm checking is meaningful;
For euclidean distance, the norm checking is meaningless;
### Why are the changes needed?
To enable norm checking for cosine distance, and disable it for euclidean distance.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
updated testsuites
Closes #35247 from zhengruifeng/fix_kmeans_ut.
Authored-by: Ruifeng Zheng <ru...@foxmail.com>
Signed-off-by: huaxingao <hu...@gmail.com>
(cherry picked from commit 789fce8c8b200eba5f94c2d83b4b83e3bfb9a2b1)
Signed-off-by: huaxingao <hu...@gmail.com>
---
.../apache/spark/ml/clustering/BisectingKMeansSuite.scala | 10 +++-------
.../scala/org/apache/spark/ml/clustering/KMeansSuite.scala | 14 +++-----------
2 files changed, 6 insertions(+), 18 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index 04b20d1..fb6110d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -186,7 +186,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(predictionsMap(Vectors.dense(-1.0, 1.0)) ==
predictionsMap(Vectors.dense(-100.0, 90.0)))
- model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
}
test("Comparing with and without weightCol with cosine distance") {
@@ -217,7 +217,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) ==
predictionsMap1(Vectors.dense(-100.0, 90.0)))
- model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
(Vectors.dense(1.0, 1.0), 2.0), (Vectors.dense(10.0, 10.0), 2.0),
@@ -244,7 +244,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) ==
predictionsMap2(Vectors.dense(-100.0, 90.0)))
- model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
assert(model1.clusterCenters === model2.clusterCenters)
}
@@ -284,8 +284,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(predictionsMap1(Vectors.dense(10.0, 10.0)) ==
predictionsMap1(Vectors.dense(10.0, 4.4)))
- model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
(Vectors.dense(1.0, 1.0), 1.0), (Vectors.dense(10.0, 10.0), 2.0),
(Vectors.dense(1.0, 0.5), 2.0), (Vectors.dense(10.0, 4.4), 3.0),
@@ -310,8 +308,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
assert(predictionsMap2(Vectors.dense(10.0, 10.0)) ==
predictionsMap2(Vectors.dense(10.0, 4.4)))
- model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
assert(model1.clusterCenters(0) === model2.clusterCenters(0))
assert(model1.clusterCenters(1) === model2.clusterCenters(1))
assert(model1.clusterCenters(2) ~== model2.clusterCenters(2) absTol 1e-6)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 61f4359..7d2a0b8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -186,7 +186,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap(Vectors.dense(-1.0, 1.0)) ==
predictionsMap(Vectors.dense(-100.0, 90.0)))
- model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
}
test("KMeans with cosine distance is not supported for 0-length vectors") {
@@ -283,7 +283,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) ==
predictionsMap1(Vectors.dense(-100.0, 90.0)))
- model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
(Vectors.dense(1.0, 1.0), 1.0),
@@ -313,7 +313,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) ==
predictionsMap2(Vectors.dense(-100.0, 90.0)))
- model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+ assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
// compare if model1 and model2 have the same cluster centers
assert(model1.clusterCenters.length === model2.clusterCenters.length)
@@ -350,8 +350,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap1(Vectors.dense(9.0, 0.2)) ==
predictionsMap1(Vectors.dense(9.2, 0.0)))
- model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
// center 1:
// total weights in cluster 1: 2.0 + 2.0 + 2.0 = 6.0
// x: 9.0 * (2.0/6.0) + 9.0 * (2.0/6.0) + 9.2 * (2.0/6.0) = 9.066666666666666
@@ -394,8 +392,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap2(Vectors.dense(9.0, 0.2)) ==
predictionsMap2(Vectors.dense(9.2, 0.0)))
- model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
// center 1:
// total weights in cluster 1: 2.5 + 1.0 + 2.0 = 5.5
// x: 9.0 * (2.5/5.5) + 9.0 * (1.0/5.5) + 9.2 * (2.0/5.5) = 9.072727272727272
@@ -441,8 +437,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap1(Vectors.dense(-6.0, -6.0)) ==
predictionsMap1(Vectors.dense(-10.0, -10.0)))
- model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
// use same weight, should have the same result as no weight
val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
(Vectors.dense(0.1, 0.1), 2.0),
@@ -474,8 +468,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
assert(predictionsMap2(Vectors.dense(-6.0, -6.0)) ==
predictionsMap2(Vectors.dense(-10.0, -10.0)))
- model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
assert(model1.clusterCenters === model2.clusterCenters)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org