You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/08/29 17:13:36 UTC

[systemds] branch master updated: [SYSTEMDS-3108] Fix distributed kmeans to avoid unnecessary cpmm

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 36a5fcf  [SYSTEMDS-3108] Fix distributed kmeans to avoid unnecessary cpmm
36a5fcf is described below

commit 36a5fcf48c07e2e0cd268f97a6ad49e95c5490ca
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Aug 29 19:12:44 2021 +0200

    [SYSTEMDS-3108] Fix distributed kmeans to avoid unnecessary cpmm
    
    This patch fixes issues with large-scale k-means, where due to unknowns
    we compile a CPMM that runs with a single task causing large shuffle
    costs and instability. Interestingly, we already support the necessary
    block index expressions during size inference, and correctly obtain the
    sizes during initial compilation, but because additional rewrites
    destroy the matched expression pattern later during IPA and
    recompilation, we can no longer infer the size and fall back to CPMM.
    
    We will address this in general with sufficient tests, but for now we
    need to harden K-means.
---
 scripts/builtin/kmeans.dml                                          | 2 +-
 .../sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 89d3282..33aaa03 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -210,6 +210,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
              + ";  Avg WCSS = " + avg_wcss + ";  Worst WCSS = " + worst_wcss);
 
     C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
+    while(FALSE){} # workaround to make ncol t(C) known
     D =  -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
     P = (D <= rowMins (D));
     aggr_P = t(cumsum (t(P)));
@@ -217,7 +218,6 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
 
     if(is_verbose)
       print("dim C=" + nrow(C) + "x" + ncol(C) + ", dim Y=" + nrow(Y) + "x" + ncol(Y))
-
   }
   else{
     print ("K-means: No output is produced. Try increasing the number of iterations and/or lower eps.");
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index 5a29e6b..63a05a4 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -223,8 +223,8 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 				HopRewriteUtils.cleanupUnreferenced(hi, input);
 				hi = hnew;
 				
-				LOG.debug("Applied removeEmptyRightIndexing");
-			}			
+				LOG.debug("Applied removeEmptyRightIndexing (line "+hi.getBeginLine()+")");
+			}
 		}
 		
 		return hi;
@@ -239,7 +239,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
 			HopRewriteUtils.cleanupUnreferenced(hi);
 			hi = input;
 			
-			LOG.debug("Applied removeUnnecessaryRightIndexing");
+			LOG.debug("Applied removeUnnecessaryRightIndexing  (line "+hi.getBeginLine()+")");
 		}
 		
 		return hi;