You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/08/29 17:13:36 UTC
[systemds] branch master updated: [SYSTEMDS-3108] Fix distributed
kmeans to avoid unnecessary cpmm
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 36a5fcf [SYSTEMDS-3108] Fix distributed kmeans to avoid unnecessary cpmm
36a5fcf is described below
commit 36a5fcf48c07e2e0cd268f97a6ad49e95c5490ca
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Sun Aug 29 19:12:44 2021 +0200
[SYSTEMDS-3108] Fix distributed kmeans to avoid unnecessary cpmm
This patch fixes issues with large-scale k-means, where due to unknowns
we compile a CPMM that runs with a single task causing large shuffle
costs and instability. Interestingly, we already support the necessary
block index expressions during size inference, and correctly obtain the
sizes during initial compilation, but because additional rewrites
destroy the matched expression pattern later during IPA and
recompilation, we can no longer infer the size and fall back to CPMM.
We will address this in general with sufficient tests, but for now we
need to harden K-means.
---
scripts/builtin/kmeans.dml | 2 +-
.../sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 89d3282..33aaa03 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -210,6 +210,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
+ "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss);
C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
+ while(FALSE){} # workaround to make ncol t(C) known
D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
P = (D <= rowMins (D));
aggr_P = t(cumsum (t(P)));
@@ -217,7 +218,6 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
if(is_verbose)
print("dim C=" + nrow(C) + "x" + ncol(C) + ", dim Y=" + nrow(Y) + "x" + ncol(Y))
-
}
else{
print ("K-means: No output is produced. Try increasing the number of iterations and/or lower eps.");
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java b/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
index 5a29e6b..63a05a4 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/RewriteAlgebraicSimplificationDynamic.java
@@ -223,8 +223,8 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
HopRewriteUtils.cleanupUnreferenced(hi, input);
hi = hnew;
- LOG.debug("Applied removeEmptyRightIndexing");
- }
+ LOG.debug("Applied removeEmptyRightIndexing (line "+hi.getBeginLine()+")");
+ }
}
return hi;
@@ -239,7 +239,7 @@ public class RewriteAlgebraicSimplificationDynamic extends HopRewriteRule
HopRewriteUtils.cleanupUnreferenced(hi);
hi = input;
- LOG.debug("Applied removeUnnecessaryRightIndexing");
+ LOG.debug("Applied removeUnnecessaryRightIndexing (line "+hi.getBeginLine()+")");
}
return hi;