You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:15 UTC
[39/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4LogReg_LTstats.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4LogReg_LTstats.dml b/scripts/datagen/genRandData4LogReg_LTstats.dml
index 6742f0c..2ec5aef 100644
--- a/scripts/datagen/genRandData4LogReg_LTstats.dml
+++ b/scripts/datagen/genRandData4LogReg_LTstats.dml
@@ -1,233 +1,233 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin = minimum feature value
-# $Xmax = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B = location to store generated regression parameters
-# $X = location to store generated training data
-# $Y = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-#
-# Example:
-# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs
-# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
-# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB = ifdef ($B, "B");
-fileX = ifdef ($X, "X");
-fileY = ifdef ($Y, "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
- numCategories = 2;
- isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0.0 & maxIntercept == 0.0) {
- do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
- if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) {
- print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]));
- } else {
- print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i]));
- }
- print (" Wanted LT mean = " + castAsScalar (meanLT [1, i]) + ", st.dev. = " + castAsScalar (new_sigmaLT [1, i]));
- print (" Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ", st.dev. = " + castAsScalar (actual_sigmaLT [1, i]));
-}
-
-
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
-R = r %*% ones;
-Y = 1 + rowSums (ppred (Prob, R, "<"));
-if (isBinomialPMOne) {
- Y = 3 - 2 * Y;
-}
-
-
-/* USE FOR LINEAR REGRESSION
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-*/
-
-
-if (do_we_output_intercept == 1) {
- new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
- new_B [1:nrow(B), 1:ncol(B)] = B;
- new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
- write (new_B, fileB, format="mm");
-} else {
- write (B, fileB, format="mm");
-}
-
-if (numTestSamples > 0) {
- X_train = X [1:numTrainingSamples,];
- Y_train = Y [1:numTrainingSamples,];
- X_test = X [(numTrainingSamples+1):numSamples,];
- Y_test = Y [(numTrainingSamples+1):numSamples,];
- write (X_train, fileX, format="mm");
- write (Y_train, fileY, format="mm");
- write (X_test, fileXt, format="mm");
- write (Y_test, fileYt, format="mm");
-} else {
- write (X, fileX, format="mm");
- write (Y, fileY, format="mm");
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, etc.
-# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
-# the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
-# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint.
-generateWeights =
- function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
- return (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
- num_w = ncol (meanLT); # Number of output weight vectors
- dim_w = ncol (X); # Number of features / dimensions in a weight vector
- w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT
- # (all orthogonal shift directions do not affect meanLT)
-
- # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
- w_1 = straightenX (X);
- r_1 = (X %*% w_1) - 1.0;
- norm_r_1_sq = sum (r_1 ^ 2);
-
- # For each W[, i] generate uniformly random directions to shift away from "w_1"
-
- DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
- DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
- XDW = X %*% DW;
-
- # Determine how far to shift in the chosen directions to satisfy the constraints
- # Use the positive root of the quadratic equation; relax sigmaLT where needed
-
- a_qe = colSums (XDW ^ 2);
- b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
- c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
- is_sigmaLT_OK = ppred (c_qe, 0.0, "<=");
- new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
- c_qe = is_sigmaLT_OK * c_qe;
- x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-
- # Scale and shift "w_1" in the "DW" directions to produce the result:
-
- ones = matrix (1.0, rows = dim_w, cols = 1);
- W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
-straightenX =
- function (Matrix[double] X)
- return (Matrix[double] w)
-{
- w_X = t(colSums(X));
- lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
- eps = 0.000000001 * nrow(X);
-
- # BEGIN LEAST SQUARES
-
- r_LS = - w_X;
- z_LS = matrix (0.0, rows = ncol(X), cols = 1);
- p_LS = - r_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- i_LS = 0;
- while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
- {
- temp_LS = X %*% p_LS;
- q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
- alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
- z_LS = z_LS + alpha_LS * p_LS;
- old_norm_r2_LS = norm_r2_LS;
- r_LS = r_LS + alpha_LS * q_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
- i_LS = i_LS + 1;
- }
-
- # END LEAST SQUARES
-
- w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test bi- and multinomial logistic regression
+
+# $N = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+# $Xmin = minimum feature value
+# $Xmax = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B = location to store generated regression parameters
+# $X = location to store generated training data
+# $Y = location to store generated training category labels
+# $Xt = location to store generated test data
+# $Yt = location to store generated test category labels
+#
+# Example:
+# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs
+# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
+# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
+
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+fileB = ifdef ($B, "B");
+fileX = ifdef ($X, "X");
+fileY = ifdef ($Y, "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+
+
+numSamples = numTrainingSamples + numTestSamples;
+
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+ numCategories = 2;
+ isBinomialPMOne = TRUE;
+}
+do_we_output_intercept = 1;
+if (minIntercept == 0.0 & maxIntercept == 0.0) {
+ do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
+
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+for (i in 1:(numCategories - 1)) {
+ if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) {
+ print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]));
+ } else {
+ print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i]));
+ }
+ print (" Wanted LT mean = " + castAsScalar (meanLT [1, i]) + ", st.dev. = " + castAsScalar (new_sigmaLT [1, i]));
+ print (" Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ", st.dev. = " + castAsScalar (actual_sigmaLT [1, i]));
+}
+
+
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
+R = r %*% ones;
+Y = 1 + rowSums (ppred (Prob, R, "<"));
+if (isBinomialPMOne) {
+ Y = 3 - 2 * Y;
+}
+
+
+/* USE FOR LINEAR REGRESSION
+
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+*/
+
+
+if (do_we_output_intercept == 1) {
+ new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+ new_B [1:nrow(B), 1:ncol(B)] = B;
+ new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+ write (new_B, fileB, format="mm");
+} else {
+ write (B, fileB, format="mm");
+}
+
+if (numTestSamples > 0) {
+ X_train = X [1:numTrainingSamples,];
+ Y_train = Y [1:numTrainingSamples,];
+ X_test = X [(numTrainingSamples+1):numSamples,];
+ Y_test = Y [(numTrainingSamples+1):numSamples,];
+ write (X_train, fileX, format="mm");
+ write (Y_train, fileY, format="mm");
+ write (X_test, fileXt, format="mm");
+ write (Y_test, fileYt, format="mm");
+} else {
+ write (X, fileX, format="mm");
+ write (Y, fileY, format="mm");
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression, etc.
+# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
+# the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
+# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint.
+generateWeights =
+ function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+ return (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+ num_w = ncol (meanLT); # Number of output weight vectors
+ dim_w = ncol (X); # Number of features / dimensions in a weight vector
+ w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT
+ # (all orthogonal shift directions do not affect meanLT)
+
+ # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+ w_1 = straightenX (X);
+ r_1 = (X %*% w_1) - 1.0;
+ norm_r_1_sq = sum (r_1 ^ 2);
+
+ # For each W[, i] generate uniformly random directions to shift away from "w_1"
+
+ DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+ DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
+ XDW = X %*% DW;
+
+ # Determine how far to shift in the chosen directions to satisfy the constraints
+ # Use the positive root of the quadratic equation; relax sigmaLT where needed
+
+ a_qe = colSums (XDW ^ 2);
+ b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+ c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+ is_sigmaLT_OK = ppred (c_qe, 0.0, "<=");
+ new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
+ c_qe = is_sigmaLT_OK * c_qe;
+ x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+
+ # Scale and shift "w_1" in the "DW" directions to produce the result:
+
+ ones = matrix (1.0, rows = dim_w, cols = 1);
+ W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
+straightenX =
+ function (Matrix[double] X)
+ return (Matrix[double] w)
+{
+ w_X = t(colSums(X));
+ lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+ eps = 0.000000001 * nrow(X);
+
+ # BEGIN LEAST SQUARES
+
+ r_LS = - w_X;
+ z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+ p_LS = - r_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ i_LS = 0;
+ while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+ {
+ temp_LS = X %*% p_LS;
+ q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+ alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+ z_LS = z_LS + alpha_LS * p_LS;
+ old_norm_r2_LS = norm_r2_LS;
+ r_LS = r_LS + alpha_LS * q_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+ i_LS = i_LS + 1;
+ }
+
+ # END LEAST SQUARES
+
+ w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4MultiClassSVM.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4MultiClassSVM.dml b/scripts/datagen/genRandData4MultiClassSVM.dml
index 65ee1d4..5d9fbcb 100644
--- a/scripts/datagen/genRandData4MultiClassSVM.dml
+++ b/scripts/datagen/genRandData4MultiClassSVM.dml
@@ -1,68 +1,68 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
-X = X * maxFeatureValue
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X%*%w
-if(b!=0) {
- b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
- w = t(append(t(w), b_mat))
- ot = ot + b
-}
-
-prob = 1/(1+exp(-ot))
-if(addNoise == 1){
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}else{
- print("this data generator generates the same dataset for both noise=0 and noise=1")
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
- #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
-}
-Y = 1 - 2*ppred(prob, r, "<")
-Y = (Y+3)/2
-
-write(w, $5, format="binary")
-write(X, $6, format="binary")
-write(Y, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data to test linear logistic regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+
+numSamples = $1
+numFeatures = $2
+maxFeatureValue = $3
+maxWeight = $4
+addNoise = $8
+b = $9
+
+X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
+X = X * maxFeatureValue
+
+w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
+w = w * maxWeight
+
+ot = X%*%w
+if(b!=0) {
+ b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
+ w = t(append(t(w), b_mat))
+ ot = ot + b
+}
+
+prob = 1/(1+exp(-ot))
+if(addNoise == 1){
+ r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
+}else{
+ print("this data generator generates the same dataset for both noise=0 and noise=1")
+ r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
+ #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
+}
+Y = 1 - 2*ppred(prob, r, "<")
+Y = (Y+3)/2
+
+write(w, $5, format="binary")
+write(X, $6, format="binary")
+write(Y, $7, format="binary")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMF.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4NMF.dml b/scripts/datagen/genRandData4NMF.dml
index 5988d48..cf18430 100644
--- a/scripts/datagen/genRandData4NMF.dml
+++ b/scripts/datagen/genRandData4NMF.dml
@@ -1,129 +1,129 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# (may overlap). use this to vary
-# sparsity.
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = ppred(denomsTM, 0, "==")
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
- docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
- docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
- topicDist = topicDistributions[i,]
-
- denom2 = sum(topicDist)
- if(denom2 == 0){
- denom2 = denom2 + 0.1
- }
-
- topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
- topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
-}
-
-data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-parfor(i in 1:numDocuments){
- docTopic = docTopicMixtures[i,]
-
- ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
-
- r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
- r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-
- for(j in 1:numWordsPerDoc){
- rz = castAsScalar(r_z[j,1])
- continue = 1
-
- z = -1
- #this is a workaround
- #z=1
-
- for(k1 in 1:numTopics){
- prob = castAsScalar(docTopic[1,k1])
- if(continue==1 & rz <= prob){
- z=k1
- continue=0
- }
- }
-
- if(z==-1){
- print("z is unassigned: " + z)
- z = numTopics
- }
-
- rw = castAsScalar(r_w[j,1])
- continue = 1
-
- w = -1
- #this is a workaround
- #w = 1
-
- for(k2 in 1:numFeatures){
- prob = castAsScalar(topicDistributions[z,k2])
- if(continue == 1 & rw <= prob){
- w = k2
- continue = 0
- }
- }
-
- if(w==-1){
- print("w is unassigned: " + w)
- w = numFeatures
- }
-
- ldata[1,w] = ldata[1,w] + 1
- }
-
- data[i,] = ldata;
-}
-
-write(data, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# (may overlap). use this to vary
+# sparsity.
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = ppred(denomsTM, 0, "==")
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+ docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+for(j in 2:numTopics){
+ docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+ topicDist = topicDistributions[i,]
+
+ denom2 = sum(topicDist)
+ if(denom2 == 0){
+ denom2 = denom2 + 0.1
+ }
+
+ topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+for(j in 2:numFeatures){
+ topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
+}
+
+data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+parfor(i in 1:numDocuments){
+ docTopic = docTopicMixtures[i,]
+
+ ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
+
+ r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+ r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+ for(j in 1:numWordsPerDoc){
+ rz = castAsScalar(r_z[j,1])
+ continue = 1
+
+ z = -1
+ #this is a workaround
+ #z=1
+
+ for(k1 in 1:numTopics){
+ prob = castAsScalar(docTopic[1,k1])
+ if(continue==1 & rz <= prob){
+ z=k1
+ continue=0
+ }
+ }
+
+ if(z==-1){
+ print("z is unassigned: " + z)
+ z = numTopics
+ }
+
+ rw = castAsScalar(r_w[j,1])
+ continue = 1
+
+ w = -1
+ #this is a workaround
+ #w = 1
+
+ for(k2 in 1:numFeatures){
+ prob = castAsScalar(topicDistributions[z,k2])
+ if(continue == 1 & rw <= prob){
+ w = k2
+ continue = 0
+ }
+ }
+
+ if(w==-1){
+ print("w is unassigned: " + w)
+ w = numFeatures
+ }
+
+ ldata[1,w] = ldata[1,w] + 1
+ }
+
+ data[i,] = ldata;
+}
+
+write(data, $7, format="binary")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMFBlockwise.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4NMFBlockwise.dml b/scripts/datagen/genRandData4NMFBlockwise.dml
index 63133da..e3fd67f 100644
--- a/scripts/datagen/genRandData4NMFBlockwise.dml
+++ b/scripts/datagen/genRandData4NMFBlockwise.dml
@@ -1,138 +1,138 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# (may overlap). use this to vary
-# sparsity.
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-#
-# $8 is the blocksize, i.e., number of rows per block
-# (should be set such that $8x$2 fits in mem budget)
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-blocksize = $8
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = ppred(denomsTM, 0, "==")
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
- docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
- docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
- topicDist = topicDistributions[i,]
-
- denom2 = sum(topicDist)
- if(denom2 == 0){
- denom2 = denom2 + 0.1
- }
-
- topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
- topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
-}
-
-data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-#outer-loop for blockwise computation
-for( k in seq(1,numDocuments,blocksize) )
-{
- len = min(blocksize,numDocuments-k); #block length
- data = data0[k:(k+len),]; #obtain block
-
- parfor(i in 1:len){
- docTopic = docTopicMixtures[i,]
-
- r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
- r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-
- for(j in 1:numWordsPerDoc){
- rz = castAsScalar(r_z[j,1])
- continue = 1
-
- z = -1
- #this is a workaround
- #z=1
-
- for(k1 in 1:numTopics){
- prob = castAsScalar(docTopic[1,k1])
- if(continue==1 & rz <= prob){
- z=k1
- continue=0
- }
- }
-
- if(z==-1){
- print("z is unassigned: " + z)
- z = numTopics
- }
-
- rw = castAsScalar(r_w[j,1])
- continue = 1
-
- w = -1
- #this is a workaround
- #w = 1
-
- for(k2 in 1:numFeatures){
- prob = castAsScalar(topicDistributions[z,k2])
- if(continue == 1 & rw <= prob){
- w = k2
- continue = 0
- }
- }
-
- if(w==-1){
- print("w is unassigned: " + w)
- w = numFeatures
- }
-
- data[i,w] = data[i,w] + 1
- }
- }
-
- data0[k:(k+len),] = data; # write block back
-}
-
-write(data0, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# (may overlap). use this to vary
+# sparsity.
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+#
+# $8 is the blocksize, i.e., number of rows per block
+# (should be set such that $8x$2 fits in mem budget)
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+blocksize = $8
+
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = ppred(denomsTM, 0, "==")
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+ docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+for(j in 2:numTopics){
+ docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+ topicDist = topicDistributions[i,]
+
+ denom2 = sum(topicDist)
+ if(denom2 == 0){
+ denom2 = denom2 + 0.1
+ }
+
+ topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+for(j in 2:numFeatures){
+ topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
+}
+
+data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+#outer-loop for blockwise computation
+for( k in seq(1,numDocuments,blocksize) )
+{
+ len = min(blocksize,numDocuments-k); #block length
+ data = data0[k:(k+len),]; #obtain block
+
+ parfor(i in 1:len){
+ docTopic = docTopicMixtures[i,]
+
+ r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+ r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+ for(j in 1:numWordsPerDoc){
+ rz = castAsScalar(r_z[j,1])
+ continue = 1
+
+ z = -1
+ #this is a workaround
+ #z=1
+
+ for(k1 in 1:numTopics){
+ prob = castAsScalar(docTopic[1,k1])
+ if(continue==1 & rz <= prob){
+ z=k1
+ continue=0
+ }
+ }
+
+ if(z==-1){
+ print("z is unassigned: " + z)
+ z = numTopics
+ }
+
+ rw = castAsScalar(r_w[j,1])
+ continue = 1
+
+ w = -1
+ #this is a workaround
+ #w = 1
+
+ for(k2 in 1:numFeatures){
+ prob = castAsScalar(topicDistributions[z,k2])
+ if(continue == 1 & rw <= prob){
+ w = k2
+ continue = 0
+ }
+ }
+
+ if(w==-1){
+ print("w is unassigned: " + w)
+ w = numFeatures
+ }
+
+ data[i,w] = data[i,w] + 1
+ }
+ }
+
+ data0[k:(k+len),] = data; # write block back
+}
+
+write(data0, $7, format="binary")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4SurvAnalysis.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml b/scripts/datagen/genRandData4SurvAnalysis.dml
index 7ac235b..da94a22 100644
--- a/scripts/datagen/genRandData4SurvAnalysis.dml
+++ b/scripts/datagen/genRandData4SurvAnalysis.dml
@@ -1,133 +1,133 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
-# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# type Sting --- The type of model for which the data is being generated: "kaplan-meier" or "cox"
-# n Int Number of records
-# lambda Double 2.0 Scale parameter of the Weibull distribution used for generating timestamps
-# v Double 1.5 Shape parameter of the Weibull distribution used for generating timestamps
-# p Double 0.8 1 - probability of a record being censored
-# g Int 2 If type=kaplan-meier the number of categorical features used for grouping
-# s Int 1 If type=kaplan-meier the number of categorical features used for stratifying
-# f Int 10 If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features
-# m Int 100 If type=cox the number of features in the model
-# sp Double 1.0 If type=cox the sparsity of the feature matrix
-# O String --- Location to write the output matrix containing random data for the kaplan-meier or the cox model
-# B String --- If type=cox location to write the output matrix containing the coefficients for the cox model
-# TE String --- Location to store column indices of X corresponding to timestamp (first row) and event information (second row)
-# F String --- Location to store column indices of X which are to be used for fitting the Cox model
-# fmt String "text" The output format of results of the kaplan-meier analysis, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUTS:
-# 1- If type=kaplan-meier an n x (2+g+s) matrix O with
-# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
-# - column 2 contains the information whether an event occurred (1) or data is censored (0)
-# - columns 3:2+g contain categorical features used for grouping
-# - columns 3+g:2+g+s contain categorical features used for stratifying
-# if type=cox an n x (2+m) matrix O with
-# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
-# - column 2 contains the information whether an event occurred (1) or data is censored (0)
-# - columns 3:2+m contain scale features
-# 2- If type=cox a coefficient matrix B
-# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row)
-# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model
-
-type = $type; # either "kaplan-meier" or "cox"
-num_records = $n;
-lambda = ifdef ($l, 2.0);
-p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
-# parameters related to the kaplan-meier model
-n_groups = ifdef ($g, 2);
-n_strata = ifdef ($s, 1);
-max_level = ifdef ($f, 10);
-# parameters related to the cox model
-num_features = ifdef ($m, 1000);
-sparsity = ifdef ($sp, 1.0);
-fileO = $O;
-fileB = $B;
-fileTE = $TE;
-fileF = $F;
-fmtO = ifdef ($fmt, "text"); # $fmt="text"
-p_censor = 1 - p_event; # prob. that record is censored
-
-if (type == "kaplan-meier") {
-
- v = ifdef ($v, 1.5);
- # generate categorical features used for grouping and stratifying
- X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
-
- # generate timestamps
- U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
- T = (-log (U) / lambda) ^ (1/v);
-
-} else if (type == "cox") {
-
- v = ifdef ($v, 50);
- # generate feature matrix
- X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity);
-
- # generate coefficients
- B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range;
-
- # generate timestamps
- U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
- T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
-
-} else {
- stop ("Wrong model type!");
-}
-
-Y = matrix (0, rows = num_records, cols = 2);
-event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event)));
-n_time = sum (event);
-Y[,2] = event;
-
-# binning of event times
-min_T = min (T);
-max_T = max (T);
-# T = T - min_T;
-len = max_T - min_T;
-num_bins = len / n_time;
-T = ceil (T / num_bins);
-
-# print ("min(T) " + min(T) + " max(T) " + max(T));
-Y[,1] = T;
-
-O = append (Y, X);
-write (O, fileO, format = fmtO);
-
-if (type == "cox") {
- write (B, fileB, format = fmtO);
-
-}
-
-TE = matrix ("1 2", rows = 2, cols = 1);
-F = seq (1, num_features);
-write (TE, fileTE, format = fmtO);
-write (F, fileF, format = fmtO);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
+# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# type Sting --- The type of model for which the data is being generated: "kaplan-meier" or "cox"
+# n Int Number of records
+# lambda Double 2.0 Scale parameter of the Weibull distribution used for generating timestamps
+# v Double 1.5 Shape parameter of the Weibull distribution used for generating timestamps
+# p Double 0.8 1 - probability of a record being censored
+# g Int 2 If type=kaplan-meier the number of categorical features used for grouping
+# s Int 1 If type=kaplan-meier the number of categorical features used for stratifying
+# f Int 10 If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features
+# m Int 100 If type=cox the number of features in the model
+# sp Double 1.0 If type=cox the sparsity of the feature matrix
+# O String --- Location to write the output matrix containing random data for the kaplan-meier or the cox model
+# B String --- If type=cox location to write the output matrix containing the coefficients for the cox model
+# TE String --- Location to store column indices of X corresponding to timestamp (first row) and event information (second row)
+# F String --- Location to store column indices of X which are to be used for fitting the Cox model
+# fmt String "text" The output format of results of the kaplan-meier analysis, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUTS:
+# 1- If type=kaplan-meier an n x (2+g+s) matrix O with
+# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
+# - column 2 contains the information whether an event occurred (1) or data is censored (0)
+# - columns 3:2+g contain categorical features used for grouping
+# - columns 3+g:2+g+s contain categorical features used for stratifying
+# if type=cox an n x (2+m) matrix O with
+# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
+# - column 2 contains the information whether an event occurred (1) or data is censored (0)
+# - columns 3:2+m contain scale features
+# 2- If type=cox a coefficient matrix B
+# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row)
+# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model
+
+type = $type; # either "kaplan-meier" or "cox"
+num_records = $n;
+lambda = ifdef ($l, 2.0);
+p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
+# parameters related to the kaplan-meier model
+n_groups = ifdef ($g, 2);
+n_strata = ifdef ($s, 1);
+max_level = ifdef ($f, 10);
+# parameters related to the cox model
+num_features = ifdef ($m, 1000);
+sparsity = ifdef ($sp, 1.0);
+fileO = $O;
+fileB = $B;
+fileTE = $TE;
+fileF = $F;
+fmtO = ifdef ($fmt, "text"); # $fmt="text"
+p_censor = 1 - p_event; # prob. that record is censored
+
+if (type == "kaplan-meier") {
+
+ v = ifdef ($v, 1.5);
+ # generate categorical features used for grouping and stratifying
+ X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
+
+ # generate timestamps
+ U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
+ T = (-log (U) / lambda) ^ (1/v);
+
+} else if (type == "cox") {
+
+ v = ifdef ($v, 50);
+ # generate feature matrix
+ X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity);
+
+ # generate coefficients
+ B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range;
+
+ # generate timestamps
+ U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
+ T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
+
+} else {
+ stop ("Wrong model type!");
+}
+
+Y = matrix (0, rows = num_records, cols = 2);
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event)));
+n_time = sum (event);
+Y[,2] = event;
+
+# binning of event times
+min_T = min (T);
+max_T = max (T);
+# T = T - min_T;
+len = max_T - min_T;
+num_bins = len / n_time;
+T = ceil (T / num_bins);
+
+# print ("min(T) " + min(T) + " max(T) " + max(T));
+Y[,1] = T;
+
+O = append (Y, X);
+write (O, fileO, format = fmtO);
+
+if (type == "cox") {
+ write (B, fileB, format = fmtO);
+
+}
+
+TE = matrix ("1 2", rows = 2, cols = 1);
+F = seq (1, num_features);
+write (TE, fileTE, format = fmtO);
+write (F, fileF, format = fmtO);
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Transform.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4Transform.dml b/scripts/datagen/genRandData4Transform.dml
index b207629..bc799d6 100644
--- a/scripts/datagen/genRandData4Transform.dml
+++ b/scripts/datagen/genRandData4Transform.dml
@@ -1,96 +1,96 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random data to test transform with
-#
-# rows, cols: dimensions of the data matrix to be generated
-# prob_categorical: percentage of the generated cols to be categorical
-# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols
-# prob_missing: percentage of the generated (scale) cols to have missing values
-# prob_missing_cell: probability of a cell to have a missing value
-# out_X, out_missing, out_categorical: output file names
-#
-
-#params for size of data
-num_rows = ifdef($rows, 1000)
-num_cols = ifdef($cols, 25)
-
-#params for kind of cols
-prob_categorical = ifdef($prob_cat, 0.1)
-min_domain_size = ifdef($min_domain, 1)
-max_domain_size = ifdef($max_domain, 10)
-
-#params for missing value cols
-prob_missing_col = ifdef($prob_missing, 0.1)
-prob_missing_val = ifdef($prob_missing_cell, 0.2)
-
-num_scalar_cols = as.double(num_cols)
-num_categorical_cols = 0.0
-scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
-if(prob_categorical > 0){
- categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
- categorical_ind = ppred(categorical_ind, prob_categorical, "<")
- categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
- num_categorical_cols = sum(categorical_ind)
- write(categorical_col_ids, $out_categorical, format="csv")
-
- domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
- domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
-
- categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
- categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
-
- scalar_ind = 1-categorical_ind
-}
-
-scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
-num_scalar_cols = sum(scalar_ind)
-scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
-
-if(num_categorical_cols > 0 & num_scalar_cols > 0){
- X = append(scalar_X, categorical_X)
- permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
- fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
- permut_mat = t(append(t(permut_mat), t(fill_in)))
- X = X %*% permut_mat
-}else{
- if(num_categorical_cols > 0) X = categorical_X
- else{
- if(num_scalar_cols > 0) X = scalar_X
- else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
- }
-}
-
-if(prob_missing_col > 0){
- missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
- missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<")
- #currently only support missing value imputation for scale cols
- missing_col_ind = missing_col_ind * scalar_ind
- missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
- missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
- missing_values = ppred(missing_values, prob_missing_val, "<")
- X = append(X, missing_values)
-
- write(missing_col_ids, $out_missing, format="csv")
-}
-
-write(X, $out_X, format="csv")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random data to test transform with
+#
+# rows, cols: dimensions of the data matrix to be generated
+# prob_categorical: percentage of the generated cols to be categorical
+# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols
+# prob_missing: percentage of the generated (scale) cols to have missing values
+# prob_missing_cell: probability of a cell to have a missing value
+# out_X, out_missing, out_categorical: output file names
+#
+
+#params for size of data
+num_rows = ifdef($rows, 1000)
+num_cols = ifdef($cols, 25)
+
+#params for kind of cols
+prob_categorical = ifdef($prob_cat, 0.1)
+min_domain_size = ifdef($min_domain, 1)
+max_domain_size = ifdef($max_domain, 10)
+
+#params for missing value cols
+prob_missing_col = ifdef($prob_missing, 0.1)
+prob_missing_val = ifdef($prob_missing_cell, 0.2)
+
+num_scalar_cols = as.double(num_cols)
+num_categorical_cols = 0.0
+scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
+if(prob_categorical > 0){
+ categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+ categorical_ind = ppred(categorical_ind, prob_categorical, "<")
+ categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
+ num_categorical_cols = sum(categorical_ind)
+ write(categorical_col_ids, $out_categorical, format="csv")
+
+ domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
+ domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
+
+ categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
+ categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
+
+ scalar_ind = 1-categorical_ind
+}
+
+scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
+num_scalar_cols = sum(scalar_ind)
+scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
+
+if(num_categorical_cols > 0 & num_scalar_cols > 0){
+ X = append(scalar_X, categorical_X)
+ permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
+ fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
+ permut_mat = t(append(t(permut_mat), t(fill_in)))
+ X = X %*% permut_mat
+}else{
+ if(num_categorical_cols > 0) X = categorical_X
+ else{
+ if(num_scalar_cols > 0) X = scalar_X
+ else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
+ }
+}
+
+if(prob_missing_col > 0){
+ missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+ missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<")
+ #currently only support missing value imputation for scale cols
+ missing_col_ind = missing_col_ind * scalar_ind
+ missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
+ missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
+ missing_values = ppred(missing_values, prob_missing_val, "<")
+ X = append(X, missing_values)
+
+ write(missing_col_ids, $out_missing, format="csv")
+}
+
+write(X, $out_X, format="csv")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Univariate.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4Univariate.dml b/scripts/datagen/genRandData4Univariate.dml
index d3c842c..bcbd528 100644
--- a/scripts/datagen/genRandData4Univariate.dml
+++ b/scripts/datagen/genRandData4Univariate.dml
@@ -1,61 +1,61 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random numbers from a distribution
-# with specified mean, standard deviation,
-# skewness, kurtosis
-# mean and standard deviation are taken in as
-# arguments by this script
-# a,b,c,d are coefficients computed by some
-# equation solver determined from the specified
-# skewness and kurtosis using power method
-# polynomials
-#
-# for more details see:
-# Statistical Simulation: Power Method Polynomials
-# and Other Transformations
-# Author: Todd C. Headrick
-# Chapman & Hall/CRC, Boca Raton, FL, 2010.
-# ISBN 978-1-4200-6490-2
-
-# $1 is the number of random points to be sampled
-# $2 is specified mean
-# $3 is specified standard deviation
-# $4-$7 are a,b,c,d obtained by solving a system
-# of equations using specified kurtosis and skewness
-# $8 is the file to write out the generated data to
-
-numSamples = $1
-mu = $2
-sigma = $3
-a = $4
-b = $5
-c = $6
-d = $7
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random numbers from a distribution
+# with specified mean, standard deviation,
+# skewness, kurtosis
+# mean and standard deviation are taken in as
+# arguments by this script
+# a,b,c,d are coefficients computed by some
+# equation solver determined from the specified
+# skewness and kurtosis using power method
+# polynomials
+#
+# for more details see:
+# Statistical Simulation: Power Method Polynomials
+# and Other Transformations
+# Author: Todd C. Headrick
+# Chapman & Hall/CRC, Boca Raton, FL, 2010.
+# ISBN 978-1-4200-6490-2
+
+# $1 is the number of random points to be sampled
+# $2 is specified mean
+# $3 is specified standard deviation
+# $4-$7 are a,b,c,d obtained by solving a system
+# of equations using specified kurtosis and skewness
+# $8 is the file to write out the generated data to
+
+numSamples = $1
+mu = $2
+sigma = $3
+a = $4
+b = $5
+c = $6
+d = $7
+
print("a=" + a + " b=" + b + " c=" + c + " d=" + d)
-X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = a + b*X + c*X^2 + d*X^3
-
-Z = Y*sigma + mu
-write(Z, $8, format="binary")
+X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
+Y = a + b*X + c*X^2 + d*X^3
+
+Z = Y*sigma + mu
+write(Z, $8, format="binary")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/PPCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PPCA.dml b/scripts/staging/PPCA.dml
index 667c709..abfcdb1 100644
--- a/scripts/staging/PPCA.dml
+++ b/scripts/staging/PPCA.dml
@@ -1,160 +1,160 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
-# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
-# Platforms. Tarek Elgamal et.al.
-
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- location to read the matrix X input matrix
-# k Int --- indicates dimension of the new vector space constructed from eigen vectors
-# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm
-# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm
-# iter Int 10 maximum number of iterations
-# fmt String 'text' output format of results PPCA such as "text" or "csv"
-# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
-# ---------------------------------------------------------------------------------------------
-# OUTPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# C Matrix --- principal components
-# V Matrix --- eigenvalues / eigenvalues of principal components
-#
-
-X = read($X);
-
-fileC = $C;
-fileV = $V;
-
-k = ifdef($k, ncol(X));
-iter = ifdef($iter, 10);
-tolobj = ifdef($tolobj, 0.00001);
-tolrecerr = ifdef($tolrecerr, 0.02);
-fmt0 = ifdef($fmt, "text");
-
-n = nrow(X);
-m = ncol(X);
-
-#initializing principal components matrix
-C = rand(rows=m, cols=k, pdf="normal");
-ss = rand(rows=1, cols=1, pdf="normal");
-ss = as.scalar(ss);
-ssPrev = ss;
-
-# best selected principle components - with the lowest reconstruction error
-PC = C;
-
-# initilizing reconstruction error
-RE = tolrecerr+1;
-REBest = RE;
-
-Z = matrix(0,rows=1,cols=1);
-
-#Objective function value
-ObjRelChng = tolobj+1;
-
-# mean centered input matrix - dim -> [n,m]
-Xm = X - colMeans(X);
-
-#I -> k x k
-ITMP = matrix(1,rows=k,cols=1);
-I = diag(ITMP);
-
-i = 0;
-while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
- #Estimation step - Covariance matrix
- #M -> k x k
- M = t(C) %*% C + I*ss;
-
- #Auxilary matrix with n latent variables
- # Z -> n x k
- Z = Xm %*% (C %*% inv(M));
-
- #ZtZ -> k x k
- ZtZ = t(Z) %*% Z + inv(M)*ss;
-
- #XtZ -> m x k
- XtZ = t(Xm) %*% Z;
-
- #Maximization step
- #C -> m x k
- ZtZ_sum = sum(ZtZ); #+n*inv(M));
- C = XtZ/ZtZ_sum;
-
- #ss2 -> 1 x 1
- ss2 = trace(ZtZ * (t(C) %*% C));
-
- #ss3 -> 1 x 1
- ss3 = sum((Z %*% t(C)) %*% t(Xm));
-
- #Frobenius norm of reconstruction error -> Euclidean norm
- #Fn -> 1 x 1
- Fn = sum(Xm*Xm);
-
- #ss -> 1 x 1
- ss = (Fn + ss2 - 2*ss3)/(n*m);
-
- #calculating objective function relative change
- ObjRelChng = abs(1 - ss/ssPrev);
- #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
-
- #Reconstruction error
- R = ((Z %*% t(C)) - Xm);
-
- #calculate the error
- #TODO rethink calculation of reconstruction error ....
- #1-Norm of reconstruction error - a big dense matrix
- #RE -> n x m
- RE = abs(sum(R)/sum(Xm));
- if (RE < REBest){
- PC = C;
- REBest = RE;
- }
- #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
-
- ssPrev = ss;
- i = i+1;
-}
-print("Objective Relative Change: " + ObjRelChng);
-print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
-
-# reconstructs data
-# RD -> n x k
-RD = X %*% PC;
-
-# calculate eigenvalues - principle component variance
-RDMean = colMeans(RD);
-V = t(colMeans(RD*RD) - (RDMean*RDMean));
-
-# sorting eigenvalues and eigenvectors in decreasing order
-V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
-VF_decr = table(seq(1,nrow(V)),V_decr_idx);
-V = VF_decr %*% V;
-PC = PC %*% VF_decr;
-
-# writing principal components
-write(PC, fileC, format=fmt0);
-# writing eigen values/pc variance
-write(V, fileV, format=fmt0);
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- location to read the matrix X input matrix
+# k Int --- indicates dimension of the new vector space constructed from eigen vectors
+# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm
+# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm
+# iter Int 10 maximum number of iterations
+# fmt String 'text' output format of results PPCA such as "text" or "csv"
+# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# C Matrix --- principal components
+# V Matrix --- eigenvalues / eigenvalues of principal components
+#
+
+X = read($X);
+
+fileC = $C;
+fileV = $V;
+
+k = ifdef($k, ncol(X));
+iter = ifdef($iter, 10);
+tolobj = ifdef($tolobj, 0.00001);
+tolrecerr = ifdef($tolrecerr, 0.02);
+fmt0 = ifdef($fmt, "text");
+
+n = nrow(X);
+m = ncol(X);
+
+#initializing principal components matrix
+C = rand(rows=m, cols=k, pdf="normal");
+ss = rand(rows=1, cols=1, pdf="normal");
+ss = as.scalar(ss);
+ssPrev = ss;
+
+# best selected principle components - with the lowest reconstruction error
+PC = C;
+
+# initilizing reconstruction error
+RE = tolrecerr+1;
+REBest = RE;
+
+Z = matrix(0,rows=1,cols=1);
+
+#Objective function value
+ObjRelChng = tolobj+1;
+
+# mean centered input matrix - dim -> [n,m]
+Xm = X - colMeans(X);
+
+#I -> k x k
+ITMP = matrix(1,rows=k,cols=1);
+I = diag(ITMP);
+
+i = 0;
+while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
+ #Estimation step - Covariance matrix
+ #M -> k x k
+ M = t(C) %*% C + I*ss;
+
+ #Auxilary matrix with n latent variables
+ # Z -> n x k
+ Z = Xm %*% (C %*% inv(M));
+
+ #ZtZ -> k x k
+ ZtZ = t(Z) %*% Z + inv(M)*ss;
+
+ #XtZ -> m x k
+ XtZ = t(Xm) %*% Z;
+
+ #Maximization step
+ #C -> m x k
+ ZtZ_sum = sum(ZtZ); #+n*inv(M));
+ C = XtZ/ZtZ_sum;
+
+ #ss2 -> 1 x 1
+ ss2 = trace(ZtZ * (t(C) %*% C));
+
+ #ss3 -> 1 x 1
+ ss3 = sum((Z %*% t(C)) %*% t(Xm));
+
+ #Frobenius norm of reconstruction error -> Euclidean norm
+ #Fn -> 1 x 1
+ Fn = sum(Xm*Xm);
+
+ #ss -> 1 x 1
+ ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+ #calculating objective function relative change
+ ObjRelChng = abs(1 - ss/ssPrev);
+ #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+ #Reconstruction error
+ R = ((Z %*% t(C)) - Xm);
+
+ #calculate the error
+ #TODO rethink calculation of reconstruction error ....
+ #1-Norm of reconstruction error - a big dense matrix
+ #RE -> n x m
+ RE = abs(sum(R)/sum(Xm));
+ if (RE < REBest){
+ PC = C;
+ REBest = RE;
+ }
+ #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+ ssPrev = ss;
+ i = i+1;
+}
+print("Objective Relative Change: " + ObjRelChng);
+print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+# reconstructs data
+# RD -> n x k
+RD = X %*% PC;
+
+# calculate eigenvalues - principle component variance
+RDMean = colMeans(RD);
+V = t(colMeans(RD*RD) - (RDMean*RDMean));
+
+# sorting eigenvalues and eigenvectors in decreasing order
+V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+V = VF_decr %*% V;
+PC = PC %*% VF_decr;
+
+# writing principal components
+write(PC, fileC, format=fmt0);
+# writing eigen values/pc variance
+write(V, fileV, format=fmt0);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/regression/lasso/lasso.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/regression/lasso/lasso.dml b/scripts/staging/regression/lasso/lasso.dml
index fb520df..1da88d3 100644
--- a/scripts/staging/regression/lasso/lasso.dml
+++ b/scripts/staging/regression/lasso/lasso.dml
@@ -1,113 +1,113 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#uses the sparsa algorithm to perform lasso regression
-
-X = read($X)
-y = read($Y)
-n = nrow(X)
-m = ncol(X)
-
-#params
-tol = 10^(-15)
-M = 5
-tau = 1
-maxiter = 1000
-
-#constants
-eta = 2
-sigma = 0.01
-alpha_min = 10^(-30)
-alpha_max = 10^(30)
-
-#init
-alpha = 1
-w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
-history = -1*10^30 * matrix(1, rows=M, cols=1)
-
-r = X %*% w - y
-g = t(X) %*% r
-obj = 0.5 * sum(r*r) + tau*sum(abs(w))
-
-print("Initial OBJ=" + obj)
-
-history[M,1] = obj
-
-inactive_set = matrix(1, rows=m, cols=1)
-iter = 0
-continue = TRUE
-while(iter < maxiter & continue) {
- dw = matrix(0, rows=m, cols=1)
- dg = matrix(0, rows=m, cols=1)
- relChangeObj = -1.0
-
- inner_iter = 0
- inner_continue = TRUE
- inner_maxiter = 100
- while(inner_iter < inner_maxiter & inner_continue) {
- u = w - g/alpha
- lambda = tau/alpha
-
- wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">")
- dw = wnew - w
- dw2 = sum(dw*dw)
-
- r = X %*% wnew - y
- gnew = t(X) %*% r
- objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))
- obj_threshold = max(history) - 0.5*sigma*alpha*dw2
-
- if(objnew <= obj_threshold) {
- w = wnew
- dg = gnew - g
- g = gnew
- inner_continue = FALSE
-
- history[1:(M-1),] = history[2:M,]
- history[M,1] = objnew
- relChangeObj = abs(objnew - obj)/obj
- obj = objnew
- }
- else
- alpha = eta*alpha
-
- inner_iter = inner_iter + 1
- }
-
- if(inner_continue)
- print("Inner loop did not converge")
-
- alphanew = sum(dw*dg)/sum(dw*dw)
- alpha = max(alpha_min, min(alpha_max, alphanew))
-
- old_inactive_set = inactive_set
- inactive_set = ppred(w, 0, "!=")
- diff = sum(abs(old_inactive_set - inactive_set))
-
- if(diff == 0 & relChangeObj < tol)
- continue = FALSE
-
- num_inactive = sum(ppred(w, 0, "!="))
- print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
- iter = iter + 1
-}
-
-write(w, $model)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#uses the sparsa algorithm to perform lasso regression
+
+X = read($X)
+y = read($Y)
+n = nrow(X)
+m = ncol(X)
+
+#params
+tol = 10^(-15)
+M = 5
+tau = 1
+maxiter = 1000
+
+#constants
+eta = 2
+sigma = 0.01
+alpha_min = 10^(-30)
+alpha_max = 10^(30)
+
+#init
+alpha = 1
+w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
+history = -1*10^30 * matrix(1, rows=M, cols=1)
+
+r = X %*% w - y
+g = t(X) %*% r
+obj = 0.5 * sum(r*r) + tau*sum(abs(w))
+
+print("Initial OBJ=" + obj)
+
+history[M,1] = obj
+
+inactive_set = matrix(1, rows=m, cols=1)
+iter = 0
+continue = TRUE
+while(iter < maxiter & continue) {
+ dw = matrix(0, rows=m, cols=1)
+ dg = matrix(0, rows=m, cols=1)
+ relChangeObj = -1.0
+
+ inner_iter = 0
+ inner_continue = TRUE
+ inner_maxiter = 100
+ while(inner_iter < inner_maxiter & inner_continue) {
+ u = w - g/alpha
+ lambda = tau/alpha
+
+ wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">")
+ dw = wnew - w
+ dw2 = sum(dw*dw)
+
+ r = X %*% wnew - y
+ gnew = t(X) %*% r
+ objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))
+ obj_threshold = max(history) - 0.5*sigma*alpha*dw2
+
+ if(objnew <= obj_threshold) {
+ w = wnew
+ dg = gnew - g
+ g = gnew
+ inner_continue = FALSE
+
+ history[1:(M-1),] = history[2:M,]
+ history[M,1] = objnew
+ relChangeObj = abs(objnew - obj)/obj
+ obj = objnew
+ }
+ else
+ alpha = eta*alpha
+
+ inner_iter = inner_iter + 1
+ }
+
+ if(inner_continue)
+ print("Inner loop did not converge")
+
+ alphanew = sum(dw*dg)/sum(dw*dw)
+ alpha = max(alpha_min, min(alpha_max, alphanew))
+
+ old_inactive_set = inactive_set
+ inactive_set = ppred(w, 0, "!=")
+ diff = sum(abs(old_inactive_set - inactive_set))
+
+ if(diff == 0 & relChangeObj < tol)
+ continue = FALSE
+
+ num_inactive = sum(ppred(w, 0, "!="))
+ print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
+ iter = iter + 1
+}
+
+write(w, $model)