You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/26 02:13:10 UTC
[46/55] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/Kmeans.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Kmeans.dml b/scripts/algorithms/Kmeans.dml
index 3ec47a8..2887baa 100644
--- a/scripts/algorithms/Kmeans.dml
+++ b/scripts/algorithms/Kmeans.dml
@@ -1,282 +1,282 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Implements the k-Means clustering algorithm
-#
-# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X String --- Location to read matrix X with the input data records
-# k Int --- Number of centroids
-# runs Int 10 Number of runs (with different initial centroids)
-# maxi Int 1000 Maximum number of iterations per run
-# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio
-# samp Int 50 Average number of records per centroid in data samples
-# C String "C.mtx" Location to store the output matrix with the centroids
-# isY Int 0 0 = do not write Y, 1 = write Y
-# Y String "Y.mtx" Location to store the mapping of records to centroids
-# fmt String "text" Matrix output format, usually "text" or "csv"
-# verb Int 0 0 = do not print per-iteration stats, 1 = print them
-# ----------------------------------------------------------------------------
-#
-# Example:
-# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx
-# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1
-
-fileX = $X;
-fileY = ifdef ($Y, "Y.mtx");
-fileC = ifdef ($C, "C.mtx");
-
-num_centroids = $k;
-num_runs = ifdef ($runs, 10); # $runs=10;
-max_iter = ifdef ($maxi, 1000); # $maxi=1000;
-eps = ifdef ($tol, 0.000001); # $tol=0.000001;
-is_write_Y = ifdef ($isY, 0); # $isY=0;
-is_verbose = ifdef ($verb, 0); # $verb=0;
-fmtCY = ifdef ($fmt, "text"); # $fmt="text";
-avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50;
-
-
-print ("BEGIN K-MEANS SCRIPT");
-print ("Reading X...");
-
-# X : matrix of data points as rows
-X = read (fileX);
-num_records = nrow (X);
-num_features = ncol (X);
-
-sumXsq = sum (X ^ 2);
-# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
-
-# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES:
-
-print ("Taking data samples for initialization...");
-
-[sample_maps, samples_vs_runs_map, sample_block_size] =
- get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
-
-is_row_in_samples = rowSums (sample_maps);
-X_samples = sample_maps %*% X;
-X_samples_sq_norms = rowSums (X_samples ^ 2);
-
-print ("Initializing the centroids for all runs...");
-All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features);
-
-# We select centroids according to the k-Means++ heuristic applied to a sample of X
-# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids,
-# with the out-of-range X_sample positions in min_distances set to 0.0
-
-min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random
-
-for (i in 1 : num_centroids)
-{
- # "Matricize" and prefix-sum to compute the cumulative distribution function:
- min_distances_matrix_form =
- matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE);
- cdf_min_distances = cumsum (min_distances_matrix_form);
-
- # Select the i-th centroid in each sample as a random sample row id with
- # probability ~ min_distances:
- random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0);
- threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
- centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1;
-
- # Place the selected centroids together, one per run, into a matrix:
- centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs));
- centroid_placer_raw =
- table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids);
- centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw;
- centroids = centroid_placer %*% X_samples;
-
- # Place the selected centroids into their appropriate slots in All_Centroids:
- centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs);
- centroid_placer_raw =
- table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1));
- centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw;
- All_Centroids = All_Centroids + centroid_placer %*% centroids;
-
- # Update min_distances to preserve the loop invariant:
- distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2)
- - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids));
- if (i == 1) {
- min_distances = is_row_in_samples * distances;
- } else {
- min_distances = min (min_distances, distances);
-} }
-
-# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS:
-
-termination_code = matrix (0, rows = num_runs, cols = 1);
-final_wcss = matrix (0, rows = num_runs, cols = 1);
-num_iterations = matrix (0, rows = num_runs, cols = 1);
-
-print ("Performing k-means iterations for all runs...");
-
-parfor (run_index in 1 : num_runs, check = 0)
-{
- C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ];
- C_old = C;
- iter_count = 0;
- term_code = 0;
- wcss = 0;
-
- while (term_code == 0)
- {
- # Compute Euclidean squared distances from records (X rows) to centroids (C rows)
- # without the C-independent term, then take the minimum for each record
- D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
- minD = rowMins (D);
- # Compute the current centroid-based within-cluster sum of squares (WCSS)
- wcss_old = wcss;
- wcss = sumXsq + sum (minD);
- if (is_verbose == 1) {
- if (iter_count == 0) {
- print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss);
- } else {
- print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss
- + "; Centroid change (avg.sq.dist.) = " + (sum ((C - C_old) ^ 2) / num_centroids));
- } }
- # Check if convergence or maximum iteration has been reached
- if (wcss_old - wcss < eps * wcss & iter_count > 0) {
- term_code = 1; # Convergence is reached
- } else {
- if (iter_count >= max_iter) {
- term_code = 2; # Maximum iteration is reached
- } else {
- iter_count = iter_count + 1;
- # Find the closest centroid for each record
- P = ppred (D, minD, "<=");
- # If some records belong to multiple centroids, share them equally
- P = P / rowSums (P);
- # Compute the column normalization factor for P
- P_denom = colSums (P);
- if (sum (ppred (P_denom, 0.0, "<=")) > 0) {
- term_code = 3; # There is a "runaway" centroid with 0.0 denominator
- } else {
- C_old = C;
- # Compute new centroids as weighted averages over the records
- C = (t(P) %*% X) / t(P_denom);
- } } } }
- print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss);
- All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C;
- final_wcss [run_index, 1] = wcss;
- termination_code [run_index, 1] = term_code;
- num_iterations [run_index, 1] = iter_count;
-}
-
-# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS:
-
-termination_bitmap = matrix (0, rows = num_runs, cols = 3);
-termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code);
-termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw;
-termination_stats = colSums (termination_bitmap);
-print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1])));
-print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2])));
-print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3])));
-
-num_successful_runs = castAsScalar (termination_stats [1, 1]);
-if (num_successful_runs > 0) {
- final_wcss_successful = final_wcss * termination_bitmap [, 1];
- worst_wcss = max (final_wcss_successful);
- best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1]));
- avg_wcss = sum (final_wcss_successful) / num_successful_runs;
- best_index_vector = ppred (final_wcss_successful, best_wcss, "==");
- aggr_best_index_vector = cumsum (best_index_vector);
- best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1);
- print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss
- + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss);
- C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
- print ("Writing out the best-WCSS centroids...");
- write (C, fileC, format=fmtCY);
- if (is_write_Y == 1) {
- print ("Writing out the best-WCSS cluster labels...");
- D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
- P = ppred (D, rowMins (D), "<=");
- aggr_P = t(cumsum (t(P)));
- Y = rowSums (ppred (aggr_P, 0, "==")) + 1
- write (Y, fileY, format=fmtCY);
- }
- print ("DONE.");
-} else {
- stop ("No output is produced. Try increasing the number of iterations and/or runs.");
-}
-
-
-
-get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
- return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
-{
- if (approx_sample_size < num_records) {
- # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's
- # to get the sample block size (to allocate space):
- sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size)));
- num_rows = sample_block_size * num_samples;
-
- # Generate all samples in parallel by converting uniform random values into random
- # integer skip-ahead intervals and prefix-summing them:
- sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
- sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
- # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
- sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one)
-
- # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1":
- is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<=");
- sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range
- + (num_records + 1) * (1 - is_sample_rec_id_within_range);
-
- # Rearrange all samples (and their out-of-range indicators) into one column-vector:
- sample_rec_ids =
- matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE);
- is_row_in_samples =
- matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE);
-
- # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation
- # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere:
- sample_maps_raw = table (seq (1, num_rows), sample_rec_ids);
- max_rec_id = ncol (sample_maps_raw);
- if (max_rec_id >= num_records) {
- sample_maps = sample_maps_raw [, 1 : num_records];
- } else {
- sample_maps = matrix (0, rows = num_rows, cols = num_records);
- sample_maps [, 1 : max_rec_id] = sample_maps_raw;
- }
-
- # Create a 0-1-matrix that maps each sample column ID into all row positions of the
- # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1:
- sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1);
- # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c:
- col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size);
- sample_col_map = table (sample_positions, col_positions);
- # Remove the out-of-sample-range positions by cutting off the last row:
- sample_col_map = sample_col_map [1 : (num_rows), ];
-
- } else {
- one_per_record = matrix (1, rows = num_records, cols = 1);
- sample_block_size = num_records;
- sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records);
- sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples);
- for (i in 1:num_samples) {
- sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record);
- sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record;
-} } }
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Implements the k-Means clustering algorithm
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------
+# X String --- Location to read matrix X with the input data records
+# k Int --- Number of centroids
+# runs Int 10 Number of runs (with different initial centroids)
+# maxi Int 1000 Maximum number of iterations per run
+# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio
+# samp Int 50 Average number of records per centroid in data samples
+# C String "C.mtx" Location to store the output matrix with the centroids
+# isY Int 0 0 = do not write Y, 1 = write Y
+# Y String "Y.mtx" Location to store the mapping of records to centroids
+# fmt String "text" Matrix output format, usually "text" or "csv"
+# verb Int 0 0 = do not print per-iteration stats, 1 = print them
+# ----------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx
+# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1
+
+fileX = $X;
+fileY = ifdef ($Y, "Y.mtx");
+fileC = ifdef ($C, "C.mtx");
+
+num_centroids = $k;
+num_runs = ifdef ($runs, 10); # $runs=10;
+max_iter = ifdef ($maxi, 1000); # $maxi=1000;
+eps = ifdef ($tol, 0.000001); # $tol=0.000001;
+is_write_Y = ifdef ($isY, 0); # $isY=0;
+is_verbose = ifdef ($verb, 0); # $verb=0;
+fmtCY = ifdef ($fmt, "text"); # $fmt="text";
+avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50;
+
+
+print ("BEGIN K-MEANS SCRIPT");
+print ("Reading X...");
+
+# X : matrix of data points as rows
+X = read (fileX);
+num_records = nrow (X);
+num_features = ncol (X);
+
+sumXsq = sum (X ^ 2);
+# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B))
+
+# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES:
+
+print ("Taking data samples for initialization...");
+
+[sample_maps, samples_vs_runs_map, sample_block_size] =
+ get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid);
+
+is_row_in_samples = rowSums (sample_maps);
+X_samples = sample_maps %*% X;
+X_samples_sq_norms = rowSums (X_samples ^ 2);
+
+print ("Initializing the centroids for all runs...");
+All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features);
+
+# We select centroids according to the k-Means++ heuristic applied to a sample of X
+# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids,
+# with the out-of-range X_sample positions in min_distances set to 0.0
+
+min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random
+
+for (i in 1 : num_centroids)
+{
+ # "Matricize" and prefix-sum to compute the cumulative distribution function:
+ min_distances_matrix_form =
+ matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE);
+ cdf_min_distances = cumsum (min_distances_matrix_form);
+
+ # Select the i-th centroid in each sample as a random sample row id with
+ # probability ~ min_distances:
+ random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0);
+ threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
+ centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1;
+
+ # Place the selected centroids together, one per run, into a matrix:
+ centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs));
+ centroid_placer_raw =
+ table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids);
+ centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw;
+ centroids = centroid_placer %*% X_samples;
+
+ # Place the selected centroids into their appropriate slots in All_Centroids:
+ centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs);
+ centroid_placer_raw =
+ table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1));
+ centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw;
+ All_Centroids = All_Centroids + centroid_placer %*% centroids;
+
+ # Update min_distances to preserve the loop invariant:
+ distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2)
+ - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids));
+ if (i == 1) {
+ min_distances = is_row_in_samples * distances;
+ } else {
+ min_distances = min (min_distances, distances);
+} }
+
+# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS:
+
+termination_code = matrix (0, rows = num_runs, cols = 1);
+final_wcss = matrix (0, rows = num_runs, cols = 1);
+num_iterations = matrix (0, rows = num_runs, cols = 1);
+
+print ("Performing k-means iterations for all runs...");
+
+parfor (run_index in 1 : num_runs, check = 0)
+{
+ C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ];
+ C_old = C;
+ iter_count = 0;
+ term_code = 0;
+ wcss = 0;
+
+ while (term_code == 0)
+ {
+ # Compute Euclidean squared distances from records (X rows) to centroids (C rows)
+ # without the C-independent term, then take the minimum for each record
+ D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
+ minD = rowMins (D);
+ # Compute the current centroid-based within-cluster sum of squares (WCSS)
+ wcss_old = wcss;
+ wcss = sumXsq + sum (minD);
+ if (is_verbose == 1) {
+ if (iter_count == 0) {
+ print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss);
+ } else {
+ print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss
+ + "; Centroid change (avg.sq.dist.) = " + (sum ((C - C_old) ^ 2) / num_centroids));
+ } }
+ # Check if convergence or maximum iteration has been reached
+ if (wcss_old - wcss < eps * wcss & iter_count > 0) {
+ term_code = 1; # Convergence is reached
+ } else {
+ if (iter_count >= max_iter) {
+ term_code = 2; # Maximum iteration is reached
+ } else {
+ iter_count = iter_count + 1;
+ # Find the closest centroid for each record
+ P = ppred (D, minD, "<=");
+ # If some records belong to multiple centroids, share them equally
+ P = P / rowSums (P);
+ # Compute the column normalization factor for P
+ P_denom = colSums (P);
+ if (sum (ppred (P_denom, 0.0, "<=")) > 0) {
+ term_code = 3; # There is a "runaway" centroid with 0.0 denominator
+ } else {
+ C_old = C;
+ # Compute new centroids as weighted averages over the records
+ C = (t(P) %*% X) / t(P_denom);
+ } } } }
+ print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss);
+ All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C;
+ final_wcss [run_index, 1] = wcss;
+ termination_code [run_index, 1] = term_code;
+ num_iterations [run_index, 1] = iter_count;
+}
+
+# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS:
+
+termination_bitmap = matrix (0, rows = num_runs, cols = 3);
+termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code);
+termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw;
+termination_stats = colSums (termination_bitmap);
+print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1])));
+print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2])));
+print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3])));
+
+num_successful_runs = castAsScalar (termination_stats [1, 1]);
+if (num_successful_runs > 0) {
+ final_wcss_successful = final_wcss * termination_bitmap [, 1];
+ worst_wcss = max (final_wcss_successful);
+ best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1]));
+ avg_wcss = sum (final_wcss_successful) / num_successful_runs;
+ best_index_vector = ppred (final_wcss_successful, best_wcss, "==");
+ aggr_best_index_vector = cumsum (best_index_vector);
+ best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1);
+ print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss
+ + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss);
+ C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ];
+ print ("Writing out the best-WCSS centroids...");
+ write (C, fileC, format=fmtCY);
+ if (is_write_Y == 1) {
+ print ("Writing out the best-WCSS cluster labels...");
+ D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
+ P = ppred (D, rowMins (D), "<=");
+ aggr_P = t(cumsum (t(P)));
+ Y = rowSums (ppred (aggr_P, 0, "==")) + 1
+ write (Y, fileY, format=fmtCY);
+ }
+ print ("DONE.");
+} else {
+ stop ("No output is produced. Try increasing the number of iterations and/or runs.");
+}
+
+
+
+get_sample_maps = function (int num_records, int num_samples, int approx_sample_size)
+ return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size)
+{
+ if (approx_sample_size < num_records) {
+ # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's
+ # to get the sample block size (to allocate space):
+ sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size)));
+ num_rows = sample_block_size * num_samples;
+
+ # Generate all samples in parallel by converting uniform random values into random
+ # integer skip-ahead intervals and prefix-summing them:
+ sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0);
+ sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5);
+ # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one]
+ sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one)
+
+ # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1":
+ is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<=");
+ sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range
+ + (num_records + 1) * (1 - is_sample_rec_id_within_range);
+
+ # Rearrange all samples (and their out-of-range indicators) into one column-vector:
+ sample_rec_ids =
+ matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE);
+ is_row_in_samples =
+ matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE);
+
+ # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation
+ # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere:
+ sample_maps_raw = table (seq (1, num_rows), sample_rec_ids);
+ max_rec_id = ncol (sample_maps_raw);
+ if (max_rec_id >= num_records) {
+ sample_maps = sample_maps_raw [, 1 : num_records];
+ } else {
+ sample_maps = matrix (0, rows = num_rows, cols = num_records);
+ sample_maps [, 1 : max_rec_id] = sample_maps_raw;
+ }
+
+ # Create a 0-1-matrix that maps each sample column ID into all row positions of the
+ # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1:
+ sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1);
+ # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c:
+ col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size);
+ sample_col_map = table (sample_positions, col_positions);
+ # Remove the out-of-sample-range positions by cutting off the last row:
+ sample_col_map = sample_col_map [1 : (num_rows), ];
+
+ } else {
+ one_per_record = matrix (1, rows = num_records, cols = 1);
+ sample_block_size = num_records;
+ sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records);
+ sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples);
+ for (i in 1:num_samples) {
+ sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record);
+ sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record;
+} } }
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/LinearRegCG.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/LinearRegCG.dml b/scripts/algorithms/LinearRegCG.dml
index a485d83..ebfa5a4 100644
--- a/scripts/algorithms/LinearRegCG.dml
+++ b/scripts/algorithms/LinearRegCG.dml
@@ -1,286 +1,286 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X String --- Location (on HDFS) to read the matrix X of feature vectors
-# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
-# B String --- Location to store estimated regression parameters (the betas)
-# O String " " Location to write the printed statistics; by default is standard output
-# Log String " " Location to write per-iteration variables for log/debugging purposes
-# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
-# for highly dependend/sparse/numerous features
-# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if
-# L2 norm of the beta-residual is less than tolerance * its initial norm
-# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum
-# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
-# --------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
-# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
-# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
-# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# Col.2: betas for shifted/rescaled X and intercept
-#
-# In addition, some regression statistics are provided in CSV format, one comma-separated
-# name-value pair per each line, as follows:
-#
-# NAME MEANING
-# -------------------------------------------------------------------------------------
-# AVG_TOT_Y Average of the response value Y
-# STDEV_TOT_Y Standard Deviation of the response value Y
-# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
-# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
-# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
-# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
-# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
-# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
-# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
-# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
-# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
-# -------------------------------------------------------------------------------------
-# * The last two statistics are only printed if there is no intercept (icpt=0)
-#
-# The Log file, when requested, contains the following per-iteration variables in CSV
-# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for
-# initial values:
-#
-# NAME MEANING
-# -------------------------------------------------------------------------------------
-# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y
-# where A = t(X) %*% X + diag (lambda), or a similar quantity
-# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial
-# -------------------------------------------------------------------------------------
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
-# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileO = ifdef ($O, " ");
-fileLog = ifdef ($Log, " ");
-fmtB = ifdef ($fmt, "text");
-
-intercept_status = ifdef ($icpt, 0); # $icpt=0;
-tolerance = ifdef ($tol, 0.000001); # $tol=0.000001;
-max_iteration = ifdef ($maxi, 0); # $maxi=0;
-regularization = ifdef ($reg, 0.000001); # $reg=0.000001;
-
-print ("BEGIN LINEAR REGRESSION SCRIPT");
-print ("Reading X and Y...");
-X = read (fileX);
-y = read (fileY);
-
-n = nrow (X);
-m = ncol (X);
-ones_n = matrix (1, rows = n, cols = 1);
-zero_cell = matrix (0, rows = 1, cols = 1);
-
-# Introduce the intercept, shift and rescale the columns of X if needed
-
-m_ext = m;
-if (intercept_status == 1 | intercept_status == 2) # add the intercept column
-{
- X = append (X, ones_n);
- m_ext = ncol (X);
-}
-
-scale_lambda = matrix (1, rows = m_ext, cols = 1);
-if (intercept_status == 1 | intercept_status == 2)
-{
- scale_lambda [m_ext, 1] = 0;
-}
-
-if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
-{ # Important assumption: X [, m_ext] = ones_n
- avg_X_cols = t(colSums(X)) / n;
- var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
- is_unsafe = ppred (var_X_cols, 0.0, "<=");
- scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
- scale_X [m_ext, 1] = 1;
- shift_X = - avg_X_cols * scale_X;
- shift_X [m_ext, 1] = 0;
-} else {
- scale_X = matrix (1, rows = m_ext, cols = 1);
- shift_X = matrix (0, rows = m_ext, cols = 1);
-}
-
-# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
-# instead of "X". However, in order to preserve the sparsity of X,
-# we apply the transform associatively to some other part of the expression
-# in which it occurs. To avoid materializing a large matrix, we rewrite it:
-#
-# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# ssX_A = diag (scale_X) %*% A;
-# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
-#
-# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
-
-lambda = scale_lambda * regularization;
-beta_unscaled = matrix (0, rows = m_ext, cols = 1);
-
-if (max_iteration == 0) {
- max_iteration = m_ext;
-}
-i = 0;
-
-# BEGIN THE CONJUGATE GRADIENT ALGORITHM
-print ("Running the CG algorithm...");
-
-r = - t(X) %*% y;
-
-if (intercept_status == 2) {
- r = scale_X * r + shift_X %*% r [m_ext, ];
-}
-
-p = - r;
-norm_r2 = sum (r ^ 2);
-norm_r2_initial = norm_r2;
-norm_r2_target = norm_r2_initial * tolerance ^ 2;
-print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target));
-log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial);
-log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0");
-
-while (i < max_iteration & norm_r2 > norm_r2_target)
-{
- if (intercept_status == 2) {
- ssX_p = scale_X * p;
- ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p;
- } else {
- ssX_p = p;
- }
-
- q = t(X) %*% (X %*% ssX_p);
-
- if (intercept_status == 2) {
- q = scale_X * q + shift_X %*% q [m_ext, ];
- }
-
- q = q + lambda * p;
- a = norm_r2 / sum (p * q);
- beta_unscaled = beta_unscaled + a * p;
- r = r + a * q;
- old_norm_r2 = norm_r2;
- norm_r2 = sum (r ^ 2);
- p = -r + (norm_r2 / old_norm_r2) * p;
- i = i + 1;
- print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial));
- log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2));
- log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial));
-}
-
-if (i >= max_iteration) {
- print ("Warning: the maximum number of iterations has been reached.");
-}
-print ("The CG algorithm is done.");
-# END THE CONJUGATE GRADIENT ALGORITHM
-
-if (intercept_status == 2) {
- beta = scale_X * beta_unscaled;
- beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
-} else {
- beta = beta_unscaled;
-}
-
-print ("Computing the statistics...");
-
-avg_tot = sum (y) / n;
-ss_tot = sum (y ^ 2);
-ss_avg_tot = ss_tot - n * avg_tot ^ 2;
-var_tot = ss_avg_tot / (n - 1);
-y_residual = y - X %*% beta;
-avg_res = sum (y_residual) / n;
-ss_res = sum (y_residual ^ 2);
-ss_avg_res = ss_res - n * avg_res ^ 2;
-
-plain_R2 = 1 - ss_res / ss_avg_tot;
-if (n > m_ext) {
- dispersion = ss_res / (n - m_ext);
- adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
-} else {
- dispersion = 0.0 / 0.0;
- adjusted_R2 = 0.0 / 0.0;
-}
-
-plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
-deg_freedom = n - m - 1;
-if (deg_freedom > 0) {
- var_res = ss_avg_res / deg_freedom;
- adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
-} else {
- var_res = 0.0 / 0.0;
- adjusted_R2_nobias = 0.0 / 0.0;
- print ("Warning: zero or negative number of degrees of freedom.");
-}
-
-plain_R2_vs_0 = 1 - ss_res / ss_tot;
-if (n > m) {
- adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
-} else {
- adjusted_R2_vs_0 = 0.0 / 0.0;
-}
-
-str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
-str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
-str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
-str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
-str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
-str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
-str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
-str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
-str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
-if (intercept_status == 0) {
- str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
- str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
-}
-
-if (fileO != " ") {
- write (str, fileO);
-} else {
- print (str);
-}
-
-# Prepare the output matrix
-print ("Writing the output matrix...");
-
-if (intercept_status == 2) {
- beta_out = append (beta, beta_unscaled);
-} else {
- beta_out = beta;
-}
-write (beta_out, fileB, format=fmtB);
-
-if (fileLog != " ") {
- write (log_str, fileLog);
-}
-print ("END LINEAR REGRESSION SCRIPT");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# --------------------------------------------------------------------------------------------
+# X String --- Location (on HDFS) to read the matrix X of feature vectors
+# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
+# B String --- Location to store estimated regression parameters (the betas)
+# O String " " Location to write the printed statistics; by default is standard output
+# Log String " " Location to write per-iteration variables for log/debugging purposes
+# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
+# for highly dependend/sparse/numerous features
+# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if
+# L2 norm of the beta-residual is less than tolerance * its initial norm
+# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum
+# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
+# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, some regression statistics are provided in CSV format, one comma-separated
+# name-value pair per each line, as follows:
+#
+# NAME MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y Average of the response value Y
+# STDEV_TOT_Y Standard Deviation of the response value Y
+# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+#
+# The Log file, when requested, contains the following per-iteration variables in CSV
+# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for
+# initial values:
+#
+# NAME MEANING
+# -------------------------------------------------------------------------------------
+# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y
+# where A = t(X) %*% X + diag (lambda), or a similar quantity
+# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial
+# -------------------------------------------------------------------------------------
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
+# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileO = ifdef ($O, " ");
+fileLog = ifdef ($Log, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0); # $icpt=0;
+tolerance = ifdef ($tol, 0.000001); # $tol=0.000001;
+max_iteration = ifdef ($maxi, 0); # $maxi=0;
+regularization = ifdef ($reg, 0.000001); # $reg=0.000001;
+
+print ("BEGIN LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X = read (fileX);
+y = read (fileY);
+
+n = nrow (X);
+m = ncol (X);
+ones_n = matrix (1, rows = n, cols = 1);
+zero_cell = matrix (0, rows = 1, cols = 1);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+
+m_ext = m;
+if (intercept_status == 1 | intercept_status == 2) # add the intercept column
+{
+ X = append (X, ones_n);
+ m_ext = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = m_ext, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+ scale_lambda [m_ext, 1] = 0;
+}
+
+if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
+{ # Important assumption: X [, m_ext] = ones_n
+ avg_X_cols = t(colSums(X)) / n;
+ var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+ is_unsafe = ppred (var_X_cols, 0.0, "<=");
+ scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+ scale_X [m_ext, 1] = 1;
+ shift_X = - avg_X_cols * scale_X;
+ shift_X [m_ext, 1] = 0;
+} else {
+ scale_X = matrix (1, rows = m_ext, cols = 1);
+ shift_X = matrix (0, rows = m_ext, cols = 1);
+}
+
+# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
+# instead of "X". However, in order to preserve the sparsity of X,
+# we apply the transform associatively to some other part of the expression
+# in which it occurs. To avoid materializing a large matrix, we rewrite it:
+#
+# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# ssX_A = diag (scale_X) %*% A;
+# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+
+lambda = scale_lambda * regularization;
+beta_unscaled = matrix (0, rows = m_ext, cols = 1);
+
+if (max_iteration == 0) {
+ max_iteration = m_ext;
+}
+i = 0;
+
+# BEGIN THE CONJUGATE GRADIENT ALGORITHM
+print ("Running the CG algorithm...");
+
+r = - t(X) %*% y;
+
+if (intercept_status == 2) {
+ r = scale_X * r + shift_X %*% r [m_ext, ];
+}
+
+p = - r;
+norm_r2 = sum (r ^ 2);
+norm_r2_initial = norm_r2;
+norm_r2_target = norm_r2_initial * tolerance ^ 2;
+print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target));
+log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial);
+log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0");
+
+while (i < max_iteration & norm_r2 > norm_r2_target)
+{
+ if (intercept_status == 2) {
+ ssX_p = scale_X * p;
+ ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p;
+ } else {
+ ssX_p = p;
+ }
+
+ q = t(X) %*% (X %*% ssX_p);
+
+ if (intercept_status == 2) {
+ q = scale_X * q + shift_X %*% q [m_ext, ];
+ }
+
+ q = q + lambda * p;
+ a = norm_r2 / sum (p * q);
+ beta_unscaled = beta_unscaled + a * p;
+ r = r + a * q;
+ old_norm_r2 = norm_r2;
+ norm_r2 = sum (r ^ 2);
+ p = -r + (norm_r2 / old_norm_r2) * p;
+ i = i + 1;
+ print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial));
+ log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2));
+ log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial));
+}
+
+if (i >= max_iteration) {
+ print ("Warning: the maximum number of iterations has been reached.");
+}
+print ("The CG algorithm is done.");
+# END THE CONJUGATE GRADIENT ALGORITHM
+
+if (intercept_status == 2) {
+ beta = scale_X * beta_unscaled;
+ beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+} else {
+ beta = beta_unscaled;
+}
+
+print ("Computing the statistics...");
+
+avg_tot = sum (y) / n;
+ss_tot = sum (y ^ 2);
+ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+var_tot = ss_avg_tot / (n - 1);
+y_residual = y - X %*% beta;
+avg_res = sum (y_residual) / n;
+ss_res = sum (y_residual ^ 2);
+ss_avg_res = ss_res - n * avg_res ^ 2;
+
+plain_R2 = 1 - ss_res / ss_avg_tot;
+if (n > m_ext) {
+ dispersion = ss_res / (n - m_ext);
+ adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+} else {
+ dispersion = 0.0 / 0.0;
+ adjusted_R2 = 0.0 / 0.0;
+}
+
+plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+deg_freedom = n - m - 1;
+if (deg_freedom > 0) {
+ var_res = ss_avg_res / deg_freedom;
+ adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+} else {
+ var_res = 0.0 / 0.0;
+ adjusted_R2_nobias = 0.0 / 0.0;
+ print ("Warning: zero or negative number of degrees of freedom.");
+}
+
+plain_R2_vs_0 = 1 - ss_res / ss_tot;
+if (n > m) {
+ adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+} else {
+ adjusted_R2_vs_0 = 0.0 / 0.0;
+}
+
+str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
+str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
+str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
+str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
+str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
+str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
+str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
+str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
+str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
+if (intercept_status == 0) {
+ str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
+ str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
+}
+
+if (fileO != " ") {
+ write (str, fileO);
+} else {
+ print (str);
+}
+
+# Prepare the output matrix
+print ("Writing the output matrix...");
+
+if (intercept_status == 2) {
+ beta_out = append (beta, beta_unscaled);
+} else {
+ beta_out = beta;
+}
+write (beta_out, fileB, format=fmtB);
+
+if (fileLog != " ") {
+ write (log_str, fileLog);
+}
+print ("END LINEAR REGRESSION SCRIPT");
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/scripts/algorithms/LinearRegDS.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/LinearRegDS.dml b/scripts/algorithms/LinearRegDS.dml
index 501acc8..0ec663a 100644
--- a/scripts/algorithms/LinearRegDS.dml
+++ b/scripts/algorithms/LinearRegDS.dml
@@ -1,224 +1,224 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X String --- Location (on HDFS) to read the matrix X of feature vectors
-# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
-# B String --- Location to store estimated regression parameters (the betas)
-# O String " " Location to write the printed statistics; by default is standard output
-# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
-# for highly dependend/sparse/numerous features
-# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
-# --------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
-# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
-# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
-# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
-# Col.2: betas for shifted/rescaled X and intercept
-#
-# In addition, some regression statistics are provided in CSV format, one comma-separated
-# name-value pair per each line, as follows:
-#
-# NAME MEANING
-# -------------------------------------------------------------------------------------
-# AVG_TOT_Y Average of the response value Y
-# STDEV_TOT_Y Standard Deviation of the response value Y
-# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
-# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
-# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
-# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
-# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
-# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
-# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
-# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
-# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
-# -------------------------------------------------------------------------------------
-# * The last two statistics are only printed if there is no intercept (icpt=0)
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
-# O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv
-
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-fileO = ifdef ($O, " ");
-fmtB = ifdef ($fmt, "text");
-
-intercept_status = ifdef ($icpt, 0); # $icpt=0;
-regularization = ifdef ($reg, 0.000001); # $reg=0.000001;
-
-print ("BEGIN LINEAR REGRESSION SCRIPT");
-print ("Reading X and Y...");
-X = read (fileX);
-y = read (fileY);
-
-n = nrow (X);
-m = ncol (X);
-ones_n = matrix (1, rows = n, cols = 1);
-zero_cell = matrix (0, rows = 1, cols = 1);
-
-# Introduce the intercept, shift and rescale the columns of X if needed
-
-m_ext = m;
-if (intercept_status == 1 | intercept_status == 2) # add the intercept column
-{
- X = append (X, ones_n);
- m_ext = ncol (X);
-}
-
-scale_lambda = matrix (1, rows = m_ext, cols = 1);
-if (intercept_status == 1 | intercept_status == 2)
-{
- scale_lambda [m_ext, 1] = 0;
-}
-
-if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
-{ # Important assumption: X [, m_ext] = ones_n
- avg_X_cols = t(colSums(X)) / n;
- var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
- is_unsafe = ppred (var_X_cols, 0.0, "<=");
- scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
- scale_X [m_ext, 1] = 1;
- shift_X = - avg_X_cols * scale_X;
- shift_X [m_ext, 1] = 0;
-} else {
- scale_X = matrix (1, rows = m_ext, cols = 1);
- shift_X = matrix (0, rows = m_ext, cols = 1);
-}
-
-# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
-# instead of "X". However, in order to preserve the sparsity of X,
-# we apply the transform associatively to some other part of the expression
-# in which it occurs. To avoid materializing a large matrix, we rewrite it:
-#
-# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# ssX_A = diag (scale_X) %*% A;
-# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
-#
-# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
-# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
-
-lambda = scale_lambda * regularization;
-
-# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
-
-A = t(X) %*% X;
-b = t(X) %*% y;
-if (intercept_status == 2) {
- A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
- A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
- b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
-}
-A = A + diag (lambda);
-
-print ("Calling the Direct Solver...");
-
-beta_unscaled = solve (A, b);
-
-# END THE DIRECT SOLVE ALGORITHM
-
-if (intercept_status == 2) {
- beta = scale_X * beta_unscaled;
- beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
-} else {
- beta = beta_unscaled;
-}
-
-print ("Computing the statistics...");
-
-avg_tot = sum (y) / n;
-ss_tot = sum (y ^ 2);
-ss_avg_tot = ss_tot - n * avg_tot ^ 2;
-var_tot = ss_avg_tot / (n - 1);
-y_residual = y - X %*% beta;
-avg_res = sum (y_residual) / n;
-ss_res = sum (y_residual ^ 2);
-ss_avg_res = ss_res - n * avg_res ^ 2;
-
-plain_R2 = 1 - ss_res / ss_avg_tot;
-if (n > m_ext) {
- dispersion = ss_res / (n - m_ext);
- adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
-} else {
- dispersion = 0.0 / 0.0;
- adjusted_R2 = 0.0 / 0.0;
-}
-
-plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
-deg_freedom = n - m - 1;
-if (deg_freedom > 0) {
- var_res = ss_avg_res / deg_freedom;
- adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
-} else {
- var_res = 0.0 / 0.0;
- adjusted_R2_nobias = 0.0 / 0.0;
- print ("Warning: zero or negative number of degrees of freedom.");
-}
-
-plain_R2_vs_0 = 1 - ss_res / ss_tot;
-if (n > m) {
- adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
-} else {
- adjusted_R2_vs_0 = 0.0 / 0.0;
-}
-
-str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
-str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
-str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
-str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
-str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
-str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
-str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
-str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
-str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
-if (intercept_status == 0) {
- str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
- str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
-}
-
-if (fileO != " ") {
- write (str, fileO);
-} else {
- print (str);
-}
-
-# Prepare the output matrix
-print ("Writing the output matrix...");
-
-if (intercept_status == 2) {
- beta_out = append (beta, beta_unscaled);
-} else {
- beta_out = beta;
-}
-write (beta_out, fileB, format=fmtB);
-print ("END LINEAR REGRESSION SCRIPT");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# --------------------------------------------------------------------------------------------
+# X String --- Location (on HDFS) to read the matrix X of feature vectors
+# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
+# B String --- Location to store estimated regression parameters (the betas)
+# O String " " Location to write the printed statistics; by default is standard output
+# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero
+# for highly dependend/sparse/numerous features
+# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
+# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, some regression statistics are provided in CSV format, one comma-separated
+# name-value pair per each line, as follows:
+#
+# NAME MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y Average of the response value Y
+# STDEV_TOT_Y Standard Deviation of the response value Y
+# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2 Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
+# O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileO = ifdef ($O, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0); # $icpt=0;
+regularization = ifdef ($reg, 0.000001); # $reg=0.000001;
+
+print ("BEGIN LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X = read (fileX);
+y = read (fileY);
+
+n = nrow (X);
+m = ncol (X);
+ones_n = matrix (1, rows = n, cols = 1);
+zero_cell = matrix (0, rows = 1, cols = 1);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+
+m_ext = m;
+if (intercept_status == 1 | intercept_status == 2) # add the intercept column
+{
+ X = append (X, ones_n);
+ m_ext = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = m_ext, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+ scale_lambda [m_ext, 1] = 0;
+}
+
+if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1
+{ # Important assumption: X [, m_ext] = ones_n
+ avg_X_cols = t(colSums(X)) / n;
+ var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+ is_unsafe = ppred (var_X_cols, 0.0, "<=");
+ scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+ scale_X [m_ext, 1] = 1;
+ shift_X = - avg_X_cols * scale_X;
+ shift_X [m_ext, 1] = 0;
+} else {
+ scale_X = matrix (1, rows = m_ext, cols = 1);
+ shift_X = matrix (0, rows = m_ext, cols = 1);
+}
+
+# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
+# instead of "X". However, in order to preserve the sparsity of X,
+# we apply the transform associatively to some other part of the expression
+# in which it occurs. To avoid materializing a large matrix, we rewrite it:
+#
+# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# ssX_A = diag (scale_X) %*% A;
+# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+
+lambda = scale_lambda * regularization;
+
+# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+A = t(X) %*% X;
+b = t(X) %*% y;
+if (intercept_status == 2) {
+ A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+ A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+ b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+}
+A = A + diag (lambda);
+
+print ("Calling the Direct Solver...");
+
+beta_unscaled = solve (A, b);
+
+# END THE DIRECT SOLVE ALGORITHM
+
+if (intercept_status == 2) {
+ beta = scale_X * beta_unscaled;
+ beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+} else {
+ beta = beta_unscaled;
+}
+
+print ("Computing the statistics...");
+
+avg_tot = sum (y) / n;
+ss_tot = sum (y ^ 2);
+ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+var_tot = ss_avg_tot / (n - 1);
+y_residual = y - X %*% beta;
+avg_res = sum (y_residual) / n;
+ss_res = sum (y_residual ^ 2);
+ss_avg_res = ss_res - n * avg_res ^ 2;
+
+plain_R2 = 1 - ss_res / ss_avg_tot;
+if (n > m_ext) {
+ dispersion = ss_res / (n - m_ext);
+ adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+} else {
+ dispersion = 0.0 / 0.0;
+ adjusted_R2 = 0.0 / 0.0;
+}
+
+plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+deg_freedom = n - m - 1;
+if (deg_freedom > 0) {
+ var_res = ss_avg_res / deg_freedom;
+ adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+} else {
+ var_res = 0.0 / 0.0;
+ adjusted_R2_nobias = 0.0 / 0.0;
+ print ("Warning: zero or negative number of degrees of freedom.");
+}
+
+plain_R2_vs_0 = 1 - ss_res / ss_tot;
+if (n > m) {
+ adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+} else {
+ adjusted_R2_vs_0 = 0.0 / 0.0;
+}
+
+str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y
+str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y
+str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias
+str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X)
+str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f.
+str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average
+str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average
+str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average
+str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average
+if (intercept_status == 0) {
+ str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant
+ str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant
+}
+
+if (fileO != " ") {
+ write (str, fileO);
+} else {
+ print (str);
+}
+
+# Prepare the output matrix
+print ("Writing the output matrix...");
+
+if (intercept_status == 2) {
+ beta_out = append (beta, beta_unscaled);
+} else {
+ beta_out = beta;
+}
+write (beta_out, fileB, format=fmtB);
+print ("END LINEAR REGRESSION SCRIPT");