You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2020/10/29 20:46:09 UTC

[systemds] branch master updated: [MINOR] K-means seeded fix

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ff3819  [MINOR] K-means seeded fix
4ff3819 is described below

commit 4ff3819c20eb5035749cfd88d32c92848e187207
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Thu Oct 29 21:42:44 2020 +0100

    [MINOR] K-means seeded fix
    
    This commit change the seeding of k-means to actually work.
    Since the last change to add the seed, unfortunately, made each initial
    cluster assign to the same location, making the entire selection and
    sampling in the beginning invalid.
    Now the seed are incremented and different for each of the initial values.
---
 scripts/builtin/kmeans.dml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 4b407f0..646ec34 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -67,7 +67,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
     print ("Taking data samples for initialization...");
 
   [sample_maps, samples_vs_runs_map, sample_block_size] = get_sample_maps(
-    num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
+    num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed);
 
   is_row_in_samples = rowSums (sample_maps);
   X_samples = sample_maps %*% X;
@@ -84,6 +84,8 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
 
   min_distances = is_row_in_samples;  # Pick the 1-st centroids uniformly at random
 
+  seed = ifelse(seed == -1, -1, seed + 1)
+  random_row = rand(rows = 1, cols = num_runs, seed = seed);
   for (i in 1 : num_centroids)
   {
     # "Matricize" and prefix-sum to compute the cumulative distribution function:
@@ -92,7 +94,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
 
     # Select the i-th centroid in each sample as a random sample row id with
     # probability ~ min_distances:
-    random_row = rand(rows = 1, cols = num_runs, seed = seed);
+    random_row = random_row + rand(rows = 1, cols = 1, seed = seed + i)
     threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
     centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
 
@@ -222,12 +224,10 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer
     C = matrix(0, num_centroids,  num_records)
     Y = matrix(-1, 1, num_records)
   }
-
-
 }
 
 
-get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
+get_sample_maps = function (int num_records, int num_samples, int approx_sample_size, int seed)
                   return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
 {
   if (approx_sample_size < num_records)
@@ -239,7 +239,7 @@ get_sample_maps = function (int num_records, int num_samples, int approx_sample_
 
     # Generate all samples in parallel by converting uniform random values into random
     # integer skip-ahead intervals and prefix-summing them:
-    sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
+    sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0, seed = seed);
     sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
     # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
     sample_rec_ids = cumsum (sample_rec_ids);  #  (skip to next one) --> (skip to i-th one)