Posted to commits@spark.apache.org by hu...@apache.org on 2022/01/19 17:20:34 UTC

[spark] branch branch-3.2 updated: [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans

This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new 5cf8108  [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans
5cf8108 is described below

commit 5cf810870073693f7ec2e1f2efe030567c973fb4
Author: Ruifeng Zheng <ru...@foxmail.com>
AuthorDate: Wed Jan 19 09:17:25 2022 -0800

    [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans
    
    ### What changes were proposed in this pull request?
    
    In `KMeansSuite` and `BisectingKMeansSuite`, there are lines like the following whose Boolean result is silently discarded, so they never actually assert anything:
    
    ```
    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
    ```
    
    For cosine distance, each cluster center is a normalized vector, so its L2 norm should be 1 and the norm check is meaningful; since the centers are computed in floating point, the check needs a tolerance rather than exact equality.
    
    For Euclidean distance, the centers are not normalized, so the norm check is meaningless and is removed.
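    
    A minimal, self-contained sketch of both problems (illustration only, not part of the patch; the vector values are made up, while `Vectors.norm` is the real MLlib helper):
    
    ```scala
    import org.apache.spark.ml.linalg.Vectors
    
    // A cosine-distance center is a normalized mean, so its L2 norm is 1
    // only up to floating-point rounding:
    val center = Vectors.dense(0.6 + 1e-12, 0.8)
    println(Vectors.norm(center, 2) == 1.0)  // false: exact equality is fragile
    
    // Without assert, the Boolean result of forall is simply discarded,
    // so this line can never fail a test:
    Array(center).forall(Vectors.norm(_, 2) == 1.0)
    
    // Wrapping in assert and comparing with an absolute tolerance makes
    // the check both enforceable and robust to rounding:
    assert(Array(center).forall(v => math.abs(Vectors.norm(v, 2) - 1.0) <= 1e-6))
    ```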
    
    ### Why are the changes needed?
    
    To enable the norm check for cosine distance, and to disable it for Euclidean distance.
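    
    For reference, the fixed pattern as it appears in the diff below; the `~==` / `absTol` syntax comes from Spark's test-only `TestingUtils` implicits (not a public API; the import path below is assumed from the ML test tree), and `model` stands for any `KMeansModel` fitted with cosine distance:
    
    ```scala
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.util.TestingUtils._  // test-tree helper, path assumed
    
    // assert makes the check enforceable; ~== with absTol replaces the
    // fragile exact comparison against 1.0.
    assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
    ```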
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Updated the existing test suites.
    
    Closes #35247 from zhengruifeng/fix_kmeans_ut.
    
    Authored-by: Ruifeng Zheng <ru...@foxmail.com>
    Signed-off-by: huaxingao <hu...@gmail.com>
    (cherry picked from commit 789fce8c8b200eba5f94c2d83b4b83e3bfb9a2b1)
    Signed-off-by: huaxingao <hu...@gmail.com>
---
 .../apache/spark/ml/clustering/BisectingKMeansSuite.scala  | 10 +++-------
 .../scala/org/apache/spark/ml/clustering/KMeansSuite.scala | 14 +++-----------
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
index 04b20d1..fb6110d 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -186,7 +186,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
     assert(predictionsMap(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap(Vectors.dense(-100.0, 90.0)))
 
-    model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
   }
 
   test("Comparing with and without weightCol with cosine distance") {
@@ -217,7 +217,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
     assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap1(Vectors.dense(-100.0, 90.0)))
 
-    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
 
     val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
       (Vectors.dense(1.0, 1.0), 2.0), (Vectors.dense(10.0, 10.0), 2.0),
@@ -244,7 +244,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
     assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap2(Vectors.dense(-100.0, 90.0)))
 
-    model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
     assert(model1.clusterCenters === model2.clusterCenters)
   }
 
@@ -284,8 +284,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
     assert(predictionsMap1(Vectors.dense(10.0, 10.0)) ==
       predictionsMap1(Vectors.dense(10.0, 4.4)))
 
-    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
       (Vectors.dense(1.0, 1.0), 1.0), (Vectors.dense(10.0, 10.0), 2.0),
       (Vectors.dense(1.0, 0.5), 2.0), (Vectors.dense(10.0, 4.4), 3.0),
@@ -310,8 +308,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest {
     assert(predictionsMap2(Vectors.dense(10.0, 10.0)) ==
       predictionsMap2(Vectors.dense(10.0, 4.4)))
 
-    model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     assert(model1.clusterCenters(0) === model2.clusterCenters(0))
     assert(model1.clusterCenters(1) === model2.clusterCenters(1))
     assert(model1.clusterCenters(2) ~== model2.clusterCenters(2) absTol 1e-6)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index 61f4359..7d2a0b8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -186,7 +186,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap(Vectors.dense(-100.0, 90.0)))
 
-    model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
   }
 
   test("KMeans with cosine distance is not supported for 0-length vectors") {
@@ -283,7 +283,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap1(Vectors.dense(-100.0, 90.0)))
 
-    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
 
     val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
       (Vectors.dense(1.0, 1.0), 1.0),
@@ -313,7 +313,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) ==
       predictionsMap2(Vectors.dense(-100.0, 90.0)))
 
-    model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
+    assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6))
 
     // compare if model1 and model2 have the same cluster centers
     assert(model1.clusterCenters.length === model2.clusterCenters.length)
@@ -350,8 +350,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap1(Vectors.dense(9.0, 0.2)) ==
       predictionsMap1(Vectors.dense(9.2, 0.0)))
 
-    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     // center 1:
     // total weights in cluster 1: 2.0 + 2.0 + 2.0 = 6.0
     // x: 9.0 * (2.0/6.0) + 9.0 * (2.0/6.0) + 9.2 * (2.0/6.0) = 9.066666666666666
@@ -394,8 +392,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap2(Vectors.dense(9.0, 0.2)) ==
       predictionsMap2(Vectors.dense(9.2, 0.0)))
 
-    model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     // center 1:
     // total weights in cluster 1: 2.5 + 1.0 + 2.0 = 5.5
     // x: 9.0 * (2.5/5.5) + 9.0 * (1.0/5.5) + 9.2 * (2.0/5.5) = 9.072727272727272
@@ -441,8 +437,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap1(Vectors.dense(-6.0, -6.0)) ==
       predictionsMap1(Vectors.dense(-10.0, -10.0)))
 
-    model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     // use same weight, should have the same result as no weight
     val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq(
       (Vectors.dense(0.1, 0.1), 2.0),
@@ -474,8 +468,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes
     assert(predictionsMap2(Vectors.dense(-6.0, -6.0)) ==
       predictionsMap2(Vectors.dense(-10.0, -10.0)))
 
-    model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
-
     assert(model1.clusterCenters === model2.clusterCenters)
   }
 }
