You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:15 UTC
[39/51] [partial] incubator-systemml git commit: [SYSTEMML-482] [SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line endings, and normalizing all of the line endings.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4LogReg_LTstats.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4LogReg_LTstats.dml b/scripts/datagen/genRandData4LogReg_LTstats.dml
index 6742f0c..2ec5aef 100644
--- a/scripts/datagen/genRandData4LogReg_LTstats.dml
+++ b/scripts/datagen/genRandData4LogReg_LTstats.dml
@@ -1,233 +1,233 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N  = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin  = minimum feature value
-# $Xmax  = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B  = location to store generated regression parameters
-# $X  = location to store generated training data
-# $Y  = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-#
-# Example:
-# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs
-#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
-#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB  = ifdef ($B,  "B");
-fileX  = ifdef ($X,  "X");
-fileY  = ifdef ($Y,  "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
-    numCategories = 2;
-    isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0.0 & maxIntercept == 0.0) {
-    do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT  = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
-    if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) {
-        print ("Category " + i + ":  Intercept = " + castAsScalar (b_intercept [1, i])); 
-    } else {
-        print ("Category " + i + ":  Intercept = " + castAsScalar (b_intercept [1, i]) + ",  st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i])); 
-    }
-    print ("    Wanted LT mean = " + castAsScalar (meanLT [1, i])        + ",  st.dev. = " + castAsScalar (new_sigmaLT [1, i]));
-    print ("    Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ",  st.dev. = " + castAsScalar (actual_sigmaLT [1, i]));
-}
-
-
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
-R = r %*% ones;
-Y = 1 + rowSums (ppred (Prob, R, "<"));
-if (isBinomialPMOne) {
-    Y = 3 - 2 * Y;
-}
-
-
-/* USE FOR LINEAR REGRESSION
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-*/
-
-
-if (do_we_output_intercept == 1) {
-    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
-    new_B [1:nrow(B), 1:ncol(B)] = B;
-    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
-    write (new_B, fileB, format="mm");
-} else {
-    write (B, fileB, format="mm");
-}
-
-if (numTestSamples > 0) {
-    X_train = X [1:numTrainingSamples,];
-    Y_train = Y [1:numTrainingSamples,];
-    X_test  = X [(numTrainingSamples+1):numSamples,];
-    Y_test  = Y [(numTrainingSamples+1):numSamples,];
-    write (X_train, fileX,  format="mm");
-    write (Y_train, fileY,  format="mm");
-    write (X_test,  fileXt, format="mm");
-    write (Y_test,  fileYt, format="mm");
-} else {
-    write (X, fileX, format="mm");
-    write (Y, fileY, format="mm");
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, etc.
-# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
-#         the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
-#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this constraint.
-generateWeights = 
-    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
-    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
-    num_w = ncol (meanLT);  # Number of output weight vectors
-    dim_w = ncol (X);       # Number of features / dimensions in a weight vector
-    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes meanLT
-                            # (all orthogonal shift directions do not affect meanLT)
-
-    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
-    w_1 = straightenX (X);
-    r_1 = (X %*% w_1) - 1.0;
-    norm_r_1_sq = sum (r_1 ^ 2);
-    
-    # For each W[, i] generate uniformly random directions to shift away from "w_1"
-    
-    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
-    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
-    XDW = X %*% DW;
-    
-    # Determine how far to shift in the chosen directions to satisfy the constraints
-    # Use the positive root of the quadratic equation; relax sigmaLT where needed
-    
-    a_qe = colSums (XDW ^ 2);
-    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
-    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
-    is_sigmaLT_OK = ppred (c_qe, 0.0, "<=");
-    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
-    c_qe = is_sigmaLT_OK * c_qe;
-    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-    
-    # Scale and shift "w_1" in the "DW" directions to produce the result:
-    
-    ones = matrix (1.0, rows = dim_w, cols = 1);
-    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
-straightenX =
-    function (Matrix[double] X)
-    return   (Matrix[double] w)
-{
-    w_X = t(colSums(X));
-    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-    eps = 0.000000001 * nrow(X);
-
-    # BEGIN LEAST SQUARES
-    
-    r_LS = - w_X;
-    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-    p_LS = - r_LS;
-    norm_r2_LS = sum (r_LS ^ 2);
-    i_LS = 0;
-    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
-    {
-        temp_LS = X %*% p_LS;
-        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
-        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-        z_LS = z_LS + alpha_LS * p_LS;
-        old_norm_r2_LS = norm_r2_LS;
-        r_LS = r_LS + alpha_LS * q_LS;
-        norm_r2_LS = sum (r_LS ^ 2);
-        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-        i_LS = i_LS + 1;
-    }
-    
-    # END LEAST SQUARES
-    
-    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test bi- and multinomial logistic regression
+
+# $N  = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+# $Xmin  = minimum feature value
+# $Xmax  = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B  = location to store generated regression parameters
+# $X  = location to store generated training data
+# $Y  = location to store generated training category labels
+# $Xt = location to store generated test data
+# $Yt = location to store generated test category labels
+#
+# Example:
+# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs
+#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
+#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
+
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+fileB  = ifdef ($B,  "B");
+fileX  = ifdef ($X,  "X");
+fileY  = ifdef ($Y,  "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+
+
+numSamples = numTrainingSamples + numTestSamples;
+
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+    numCategories = 2;
+    isBinomialPMOne = TRUE;
+}
+do_we_output_intercept = 1;
+if (minIntercept == 0.0 & maxIntercept == 0.0) {
+    do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
+
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT  = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+for (i in 1:(numCategories - 1)) {
+    if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) {
+        print ("Category " + i + ":  Intercept = " + castAsScalar (b_intercept [1, i])); 
+    } else {
+        print ("Category " + i + ":  Intercept = " + castAsScalar (b_intercept [1, i]) + ",  st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i])); 
+    }
+    print ("    Wanted LT mean = " + castAsScalar (meanLT [1, i])        + ",  st.dev. = " + castAsScalar (new_sigmaLT [1, i]));
+    print ("    Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ",  st.dev. = " + castAsScalar (actual_sigmaLT [1, i]));
+}
+
+
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
+R = r %*% ones;
+Y = 1 + rowSums (ppred (Prob, R, "<"));
+if (isBinomialPMOne) {
+    Y = 3 - 2 * Y;
+}
+
+
+/* USE FOR LINEAR REGRESSION
+
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+*/
+
+
+if (do_we_output_intercept == 1) {
+    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+    new_B [1:nrow(B), 1:ncol(B)] = B;
+    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+    write (new_B, fileB, format="mm");
+} else {
+    write (B, fileB, format="mm");
+}
+
+if (numTestSamples > 0) {
+    X_train = X [1:numTrainingSamples,];
+    Y_train = Y [1:numTrainingSamples,];
+    X_test  = X [(numTrainingSamples+1):numSamples,];
+    Y_test  = Y [(numTrainingSamples+1):numSamples,];
+    write (X_train, fileX,  format="mm");
+    write (Y_train, fileY,  format="mm");
+    write (X_test,  fileXt, format="mm");
+    write (Y_test,  fileYt, format="mm");
+} else {
+    write (X, fileX, format="mm");
+    write (Y, fileY, format="mm");
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression, etc.
+# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
+#         the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
+#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this constraint.
+generateWeights = 
+    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+    num_w = ncol (meanLT);  # Number of output weight vectors
+    dim_w = ncol (X);       # Number of features / dimensions in a weight vector
+    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes meanLT
+                            # (all orthogonal shift directions do not affect meanLT)
+
+    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+    w_1 = straightenX (X);
+    r_1 = (X %*% w_1) - 1.0;
+    norm_r_1_sq = sum (r_1 ^ 2);
+    
+    # For each W[, i] generate uniformly random directions to shift away from "w_1"
+    
+    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
+    XDW = X %*% DW;
+    
+    # Determine how far to shift in the chosen directions to satisfy the constraints
+    # Use the positive root of the quadratic equation; relax sigmaLT where needed
+    
+    a_qe = colSums (XDW ^ 2);
+    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+    is_sigmaLT_OK = ppred (c_qe, 0.0, "<=");
+    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
+    c_qe = is_sigmaLT_OK * c_qe;
+    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+    
+    # Scale and shift "w_1" in the "DW" directions to produce the result:
+    
+    ones = matrix (1.0, rows = dim_w, cols = 1);
+    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+straightenX =
+    function (Matrix[double] X)
+    return   (Matrix[double] w)
+{
+    w_X = t(colSums(X));
+    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+    eps = 0.000000001 * nrow(X);
+
+    # BEGIN LEAST SQUARES
+    
+    r_LS = - w_X;
+    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+    p_LS = - r_LS;
+    norm_r2_LS = sum (r_LS ^ 2);
+    i_LS = 0;
+    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+    {
+        temp_LS = X %*% p_LS;
+        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+        z_LS = z_LS + alpha_LS * p_LS;
+        old_norm_r2_LS = norm_r2_LS;
+        r_LS = r_LS + alpha_LS * q_LS;
+        norm_r2_LS = sum (r_LS ^ 2);
+        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+        i_LS = i_LS + 1;
+    }
+    
+    # END LEAST SQUARES
+    
+    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4MultiClassSVM.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4MultiClassSVM.dml b/scripts/datagen/genRandData4MultiClassSVM.dml
index 65ee1d4..5d9fbcb 100644
--- a/scripts/datagen/genRandData4MultiClassSVM.dml
+++ b/scripts/datagen/genRandData4MultiClassSVM.dml
@@ -1,68 +1,68 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
-X = X * maxFeatureValue 
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X%*%w
-if(b!=0) {
-	b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
-	w =  t(append(t(w), b_mat))
-	ot = ot + b
-}
-
-prob = 1/(1+exp(-ot))
-if(addNoise == 1){
-	r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}else{
-	print("this data generator generates the same dataset for both noise=0 and noise=1")
-	r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-	#r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
-}
-Y = 1 - 2*ppred(prob, r, "<")
-Y = (Y+3)/2
-
-write(w, $5, format="binary")
-write(X, $6, format="binary")
-write(Y, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data to test linear logistic regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+
+numSamples = $1
+numFeatures = $2
+maxFeatureValue = $3
+maxWeight = $4
+addNoise = $8
+b = $9
+
+X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
+X = X * maxFeatureValue 
+
+w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
+w = w * maxWeight
+
+ot = X%*%w
+if(b!=0) {
+	b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
+	w =  t(append(t(w), b_mat))
+	ot = ot + b
+}
+
+prob = 1/(1+exp(-ot))
+if(addNoise == 1){
+	r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
+}else{
+	print("this data generator generates the same dataset for both noise=0 and noise=1")
+	r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
+	#r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
+}
+Y = 1 - 2*ppred(prob, r, "<")
+Y = (Y+3)/2
+
+write(w, $5, format="binary")
+write(X, $6, format="binary")
+write(Y, $7, format="binary")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMF.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4NMF.dml b/scripts/datagen/genRandData4NMF.dml
index 5988d48..cf18430 100644
--- a/scripts/datagen/genRandData4NMF.dml
+++ b/scripts/datagen/genRandData4NMF.dml
@@ -1,129 +1,129 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# 	 (may overlap). use this to vary
-#	 sparsity.	
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = ppred(denomsTM, 0, "==")
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
-	docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
-	docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
-	topicDist = topicDistributions[i,]
-	
-	denom2 = sum(topicDist)
-	if(denom2 == 0){
-		denom2 = denom2 + 0.1
-	}
-	
-	topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
-	topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
-}
-
-data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-parfor(i in 1:numDocuments){
-	docTopic = docTopicMixtures[i,]
-	
-    ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
-  
-	r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-	r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-	
-	for(j in 1:numWordsPerDoc){
-		rz = castAsScalar(r_z[j,1])
-		continue = 1
-		
-		z = -1
-		#this is a workaround
-		#z=1	
-		
-		for(k1 in 1:numTopics){
-			prob = castAsScalar(docTopic[1,k1])
-			if(continue==1 & rz <= prob){
-				z=k1
-				continue=0
-			}
-		}
-		
-		if(z==-1){
-			print("z is unassigned: " + z)
-			z = numTopics
-		}
-		
-		rw = castAsScalar(r_w[j,1])
-		continue = 1
-		
-		w = -1
-		#this is a workaround
-		#w = 1
-		
-		for(k2 in 1:numFeatures){
-			prob = castAsScalar(topicDistributions[z,k2])
-			if(continue == 1 & rw <= prob){
-				w = k2
-				continue = 0
-			}
-		}
-		
-		if(w==-1){
-			print("w is unassigned: " + w)
-			w = numFeatures
-		}
-		
-		ldata[1,w] = ldata[1,w] + 1
-	}
-  
-    data[i,] = ldata;
-}
-
-write(data, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# 	 (may overlap). use this to vary
+#	 sparsity.	
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = ppred(denomsTM, 0, "==")
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+	docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+for(j in 2:numTopics){
+	docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+	topicDist = topicDistributions[i,]
+	
+	denom2 = sum(topicDist)
+	if(denom2 == 0){
+		denom2 = denom2 + 0.1
+	}
+	
+	topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+for(j in 2:numFeatures){
+	topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
+}
+
+data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+parfor(i in 1:numDocuments){
+	docTopic = docTopicMixtures[i,]
+	
+    ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
+  
+	r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+	r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+	
+	for(j in 1:numWordsPerDoc){
+		rz = castAsScalar(r_z[j,1])
+		continue = 1
+		
+		z = -1
+		#this is a workaround
+		#z=1	
+		
+		for(k1 in 1:numTopics){
+			prob = castAsScalar(docTopic[1,k1])
+			if(continue==1 & rz <= prob){
+				z=k1
+				continue=0
+			}
+		}
+		
+		if(z==-1){
+			print("z is unassigned: " + z)
+			z = numTopics
+		}
+		
+		rw = castAsScalar(r_w[j,1])
+		continue = 1
+		
+		w = -1
+		#this is a workaround
+		#w = 1
+		
+		for(k2 in 1:numFeatures){
+			prob = castAsScalar(topicDistributions[z,k2])
+			if(continue == 1 & rw <= prob){
+				w = k2
+				continue = 0
+			}
+		}
+		
+		if(w==-1){
+			print("w is unassigned: " + w)
+			w = numFeatures
+		}
+		
+		ldata[1,w] = ldata[1,w] + 1
+	}
+  
+    data[i,] = ldata;
+}
+
+write(data, $7, format="binary")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMFBlockwise.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4NMFBlockwise.dml b/scripts/datagen/genRandData4NMFBlockwise.dml
index 63133da..e3fd67f 100644
--- a/scripts/datagen/genRandData4NMFBlockwise.dml
+++ b/scripts/datagen/genRandData4NMFBlockwise.dml
@@ -1,138 +1,138 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# 	 (may overlap). use this to vary
-#	 sparsity.	
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-#
-# $8 is the blocksize, i.e., number of rows per block
-#    (should be set such that $8x$2 fits in mem budget)
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-blocksize = $8
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = ppred(denomsTM, 0, "==")
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
-	docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
-	docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
-	topicDist = topicDistributions[i,]
-	
-	denom2 = sum(topicDist)
-	if(denom2 == 0){
-		denom2 = denom2 + 0.1
-	}
-	
-	topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
-	topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
-}
-
-data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-#outer-loop for blockwise computation
-for( k in seq(1,numDocuments,blocksize) )  
-{
-  len = min(blocksize,numDocuments-k); #block length
-  data = data0[k:(k+len),];            #obtain block
-  
-  parfor(i in 1:len){
-  	docTopic = docTopicMixtures[i,]
-  	
-  	r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-  	r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-  	
-  	for(j in 1:numWordsPerDoc){
-  		rz = castAsScalar(r_z[j,1])
-  		continue = 1
-  		
-  		z = -1
-  		#this is a workaround
-  		#z=1	
-  		
-  		for(k1 in 1:numTopics){
-  			prob = castAsScalar(docTopic[1,k1])
-  			if(continue==1 & rz <= prob){
-  				z=k1
-  				continue=0
-  			}
-  		}
-  		
-  		if(z==-1){
-  			print("z is unassigned: " + z)
-  			z = numTopics
-  		}
-  		
-  		rw = castAsScalar(r_w[j,1])
-  		continue = 1
-  		
-  		w = -1
-  		#this is a workaround
-  		#w = 1
-  		
-  		for(k2 in 1:numFeatures){
-  			prob = castAsScalar(topicDistributions[z,k2])
-  			if(continue == 1 & rw <= prob){
-  				w = k2
-  				continue = 0
-  			}
-  		}
-  		
-  		if(w==-1){
-  			print("w is unassigned: " + w)
-  			w = numFeatures
-  		}
-  		
-  		data[i,w] = data[i,w] + 1
-  	}
-  }
-  
-  data0[k:(k+len),] = data; # write block back
-}
-
-write(data0, $7, format="binary")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# 	 (may overlap). use this to vary
+#	 sparsity.	
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+#
+# $8 is the blocksize, i.e., number of rows per block
+#    (should be set such that $8x$2 fits in mem budget)
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+blocksize = $8
+
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = ppred(denomsTM, 0, "==")
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+	docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+for(j in 2:numTopics){
+	docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+	topicDist = topicDistributions[i,]
+	
+	denom2 = sum(topicDist)
+	if(denom2 == 0){
+		denom2 = denom2 + 0.1
+	}
+	
+	topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+for(j in 2:numFeatures){
+	topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
+}
+
+data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+#outer-loop for blockwise computation
+for( k in seq(1,numDocuments,blocksize) )  
+{
+  len = min(blocksize,numDocuments-k); #block length
+  data = data0[k:(k+len),];            #obtain block
+  
+  parfor(i in 1:len){
+  	docTopic = docTopicMixtures[i,]
+  	
+  	r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+  	r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+  	
+  	for(j in 1:numWordsPerDoc){
+  		rz = castAsScalar(r_z[j,1])
+  		continue = 1
+  		
+  		z = -1
+  		#this is a workaround
+  		#z=1	
+  		
+  		for(k1 in 1:numTopics){
+  			prob = castAsScalar(docTopic[1,k1])
+  			if(continue==1 & rz <= prob){
+  				z=k1
+  				continue=0
+  			}
+  		}
+  		
+  		if(z==-1){
+  			print("z is unassigned: " + z)
+  			z = numTopics
+  		}
+  		
+  		rw = castAsScalar(r_w[j,1])
+  		continue = 1
+  		
+  		w = -1
+  		#this is a workaround
+  		#w = 1
+  		
+  		for(k2 in 1:numFeatures){
+  			prob = castAsScalar(topicDistributions[z,k2])
+  			if(continue == 1 & rw <= prob){
+  				w = k2
+  				continue = 0
+  			}
+  		}
+  		
+  		if(w==-1){
+  			print("w is unassigned: " + w)
+  			w = numFeatures
+  		}
+  		
+  		data[i,w] = data[i,w] + 1
+  	}
+  }
+  
+  data0[k:(k+len),] = data; # write block back
+}
+
+write(data0, $7, format="binary")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4SurvAnalysis.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml b/scripts/datagen/genRandData4SurvAnalysis.dml
index 7ac235b..da94a22 100644
--- a/scripts/datagen/genRandData4SurvAnalysis.dml
+++ b/scripts/datagen/genRandData4SurvAnalysis.dml
@@ -1,133 +1,133 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#  
-# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
-# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V 
-#
-# INPUT   PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME    TYPE     DEFAULT      MEANING
-# ---------------------------------------------------------------------------------------------
-# type    Sting    ---          The type of model for which the data is being generated: "kaplan-meier" or "cox"
-# n       Int                   Number of records 
-# lambda  Double   2.0          Scale parameter of the Weibull distribution used for generating timestamps 
-# v       Double   1.5          Shape parameter of the Weibull distribution used for generating timestamps 
-# p       Double   0.8          1 - probability of a record being censored
-# g       Int      2            If type=kaplan-meier the number of categorical features used for grouping 
-# s       Int      1            If type=kaplan-meier the number of categorical features used for stratifying
-# f       Int      10           If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features
-# m       Int      100          If type=cox the number of features in the model
-# sp      Double   1.0          If type=cox the sparsity of the feature matrix 
-# O       String   ---          Location to write the output matrix containing random data for the kaplan-meier or the cox model 
-# B       String   ---          If type=cox location to write the output matrix containing the coefficients for the cox model 
-# TE 	  String   ---			Location to store column indices of X corresponding to timestamp (first row) and event information (second row)
-# F       String   ---			Location to store column indices of X which are to be used for fitting the Cox model
-# fmt     String   "text"       The output format of results of the kaplan-meier analysis, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUTS: 
-# 1- If type=kaplan-meier an n x (2+g+s) matrix O with      
-#    - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
-#	 - column 2 contains the information whether an event occurred (1) or data is censored (0)
-#	 - columns 3:2+g contain categorical features used for grouping	
-#    - columns 3+g:2+g+s contain categorical features used for stratifying
-#   if type=cox an n x (2+m) matrix O with 
-#	 - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
-#	 - column 2 contains the information whether an event occurred (1) or data is censored (0)
-#	 - columns 3:2+m contain scale features 
-# 2- If type=cox a coefficient matrix B
-# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row) 
-# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model
-
-type = $type; # either "kaplan-meier" or "cox" 
-num_records = $n; 
-lambda = ifdef ($l, 2.0); 
-p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
-# parameters related to the kaplan-meier model
-n_groups = ifdef ($g, 2);
-n_strata = ifdef ($s, 1);
-max_level = ifdef ($f, 10);
-# parameters related to the cox model
-num_features = ifdef ($m, 1000);  
-sparsity = ifdef ($sp, 1.0); 
-fileO = $O;
-fileB = $B; 
-fileTE = $TE;
-fileF = $F;
-fmtO = ifdef ($fmt, "text"); # $fmt="text" 
-p_censor = 1 - p_event; # prob. that record is censored
-
-if (type == "kaplan-meier") {
-	
-	v = ifdef ($v, 1.5);
-	# generate categorical features used for grouping and stratifying
-	X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
-	
-	# generate timestamps
-	U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
-	T = (-log (U) / lambda) ^ (1/v);
-
-} else if (type == "cox") {
-
-	v = ifdef ($v, 50);
-	# generate feature matrix
-	X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity);
-
-	# generate coefficients
-	B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range;	
-
-	# generate timestamps
-	U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
-	T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
-
-} else {
-	stop ("Wrong model type!");
-}
-
-Y = matrix (0, rows = num_records, cols = 2);
-event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event)));
-n_time = sum (event);
-Y[,2] = event;
-  	
-# binning of event times
-min_T = min (T);
-max_T = max (T);
-# T = T - min_T;
-len = max_T - min_T;
-num_bins = len / n_time;
-T = ceil (T / num_bins);
-
-# print ("min(T) " + min(T) + " max(T) " + max(T));
-Y[,1] = T;
-
-O = append (Y, X);
-write (O, fileO, format = fmtO);
-
-if (type == "cox") {
-	write (B, fileB, format = fmtO);
-	
-}
-
-TE = matrix ("1 2", rows = 2, cols = 1);
-F = seq (1, num_features);
-write (TE, fileTE, format = fmtO);
-write (F, fileF, format = fmtO);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
+# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V 
+#
+# INPUT   PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME    TYPE     DEFAULT      MEANING
+# ---------------------------------------------------------------------------------------------
+# type    Sting    ---          The type of model for which the data is being generated: "kaplan-meier" or "cox"
+# n       Int                   Number of records 
+# lambda  Double   2.0          Scale parameter of the Weibull distribution used for generating timestamps 
+# v       Double   1.5          Shape parameter of the Weibull distribution used for generating timestamps 
+# p       Double   0.8          1 - probability of a record being censored
+# g       Int      2            If type=kaplan-meier the number of categorical features used for grouping 
+# s       Int      1            If type=kaplan-meier the number of categorical features used for stratifying
+# f       Int      10           If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features
+# m       Int      100          If type=cox the number of features in the model
+# sp      Double   1.0          If type=cox the sparsity of the feature matrix 
+# O       String   ---          Location to write the output matrix containing random data for the kaplan-meier or the cox model 
+# B       String   ---          If type=cox location to write the output matrix containing the coefficients for the cox model 
+# TE 	  String   ---			Location to store column indices of X corresponding to timestamp (first row) and event information (second row)
+# F       String   ---			Location to store column indices of X which are to be used for fitting the Cox model
+# fmt     String   "text"       The output format of results of the kaplan-meier analysis, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUTS: 
+# 1- If type=kaplan-meier an n x (2+g+s) matrix O with      
+#    - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
+#	 - column 2 contains the information whether an event occurred (1) or data is censored (0)
+#	 - columns 3:2+g contain categorical features used for grouping	
+#    - columns 3+g:2+g+s contain categorical features used for stratifying
+#   if type=cox an n x (2+m) matrix O with 
+#	 - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v
+#	 - column 2 contains the information whether an event occurred (1) or data is censored (0)
+#	 - columns 3:2+m contain scale features 
+# 2- If type=cox a coefficient matrix B
+# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row) 
+# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model
+
+type = $type; # either "kaplan-meier" or "cox" 
+num_records = $n; 
+lambda = ifdef ($l, 2.0); 
+p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
+# parameters related to the kaplan-meier model
+n_groups = ifdef ($g, 2);
+n_strata = ifdef ($s, 1);
+max_level = ifdef ($f, 10);
+# parameters related to the cox model
+num_features = ifdef ($m, 1000);  
+sparsity = ifdef ($sp, 1.0); 
+fileO = $O;
+fileB = $B; 
+fileTE = $TE;
+fileF = $F;
+fmtO = ifdef ($fmt, "text"); # $fmt="text" 
+p_censor = 1 - p_event; # prob. that record is censored
+
+if (type == "kaplan-meier") {
+	
+	v = ifdef ($v, 1.5);
+	# generate categorical features used for grouping and stratifying
+	X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
+	
+	# generate timestamps
+	U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
+	T = (-log (U) / lambda) ^ (1/v);
+
+} else if (type == "cox") {
+
+	v = ifdef ($v, 50);
+	# generate feature matrix
+	X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity);
+
+	# generate coefficients
+	B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range;	
+
+	# generate timestamps
+	U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
+	T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
+
+} else {
+	stop ("Wrong model type!");
+}
+
+Y = matrix (0, rows = num_records, cols = 2);
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event)));
+n_time = sum (event);
+Y[,2] = event;
+  	
+# binning of event times
+min_T = min (T);
+max_T = max (T);
+# T = T - min_T;
+len = max_T - min_T;
+num_bins = len / n_time;
+T = ceil (T / num_bins);
+
+# print ("min(T) " + min(T) + " max(T) " + max(T));
+Y[,1] = T;
+
+O = append (Y, X);
+write (O, fileO, format = fmtO);
+
+if (type == "cox") {
+	write (B, fileB, format = fmtO);
+	
+}
+
+TE = matrix ("1 2", rows = 2, cols = 1);
+F = seq (1, num_features);
+write (TE, fileTE, format = fmtO);
+write (F, fileF, format = fmtO);
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Transform.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4Transform.dml b/scripts/datagen/genRandData4Transform.dml
index b207629..bc799d6 100644
--- a/scripts/datagen/genRandData4Transform.dml
+++ b/scripts/datagen/genRandData4Transform.dml
@@ -1,96 +1,96 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random data to test transform with
-#
-# rows, cols: dimensions of the data matrix to be generated
-# prob_categorical: percentage of the generated cols to be categorical
-# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols
-# prob_missing: percentage of the generated (scale) cols to have missing values
-# prob_missing_cell: probability of a cell to have a missing value
-# out_X, out_missing, out_categorical: output file names
-#
-
-#params for size of data
-num_rows = ifdef($rows, 1000)
-num_cols = ifdef($cols, 25)
-
-#params for kind of cols
-prob_categorical = ifdef($prob_cat, 0.1)
-min_domain_size = ifdef($min_domain, 1)
-max_domain_size = ifdef($max_domain, 10)
-
-#params for missing value cols
-prob_missing_col = ifdef($prob_missing, 0.1)
-prob_missing_val = ifdef($prob_missing_cell, 0.2)
-
-num_scalar_cols = as.double(num_cols)
-num_categorical_cols = 0.0
-scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
-if(prob_categorical > 0){
-  categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
-  categorical_ind = ppred(categorical_ind, prob_categorical, "<")
-  categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
-  num_categorical_cols = sum(categorical_ind)
-  write(categorical_col_ids, $out_categorical, format="csv")
-  
-  domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
-  domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
-  
-  categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
-  categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
-
-  scalar_ind = 1-categorical_ind
-}
-
-scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
-num_scalar_cols = sum(scalar_ind)
-scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
-  
-if(num_categorical_cols > 0 & num_scalar_cols > 0){
-  X = append(scalar_X, categorical_X)
-  permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
-  fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
-  permut_mat = t(append(t(permut_mat), t(fill_in)))
-  X = X %*% permut_mat
-}else{
-  if(num_categorical_cols > 0) X = categorical_X
-  else{
-    if(num_scalar_cols > 0) X = scalar_X
-    else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
-  }
-}
-
-if(prob_missing_col > 0){
-  missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
-  missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<")
-  #currently only support missing value imputation for scale cols
-  missing_col_ind = missing_col_ind * scalar_ind
-  missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
-  missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
-  missing_values = ppred(missing_values, prob_missing_val, "<")
-  X = append(X, missing_values)
-  
-  write(missing_col_ids, $out_missing, format="csv")
-}
-
-write(X, $out_X, format="csv")
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random data to test transform with
+#
+# rows, cols: dimensions of the data matrix to be generated
+# prob_categorical: percentage of the generated cols to be categorical
+# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols
+# prob_missing: percentage of the generated (scale) cols to have missing values
+# prob_missing_cell: probability of a cell to have a missing value
+# out_X, out_missing, out_categorical: output file names
+#
+
+#params for size of data
+num_rows = ifdef($rows, 1000)
+num_cols = ifdef($cols, 25)
+
+#params for kind of cols
+prob_categorical = ifdef($prob_cat, 0.1)
+min_domain_size = ifdef($min_domain, 1)
+max_domain_size = ifdef($max_domain, 10)
+
+#params for missing value cols
+prob_missing_col = ifdef($prob_missing, 0.1)
+prob_missing_val = ifdef($prob_missing_cell, 0.2)
+
+num_scalar_cols = as.double(num_cols)
+num_categorical_cols = 0.0
+scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
+if(prob_categorical > 0){
+  categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+  categorical_ind = ppred(categorical_ind, prob_categorical, "<")
+  categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
+  num_categorical_cols = sum(categorical_ind)
+  write(categorical_col_ids, $out_categorical, format="csv")
+  
+  domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
+  domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
+  
+  categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
+  categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
+
+  scalar_ind = 1-categorical_ind
+}
+
+scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
+num_scalar_cols = sum(scalar_ind)
+scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
+  
+if(num_categorical_cols > 0 & num_scalar_cols > 0){
+  X = append(scalar_X, categorical_X)
+  permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
+  fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
+  permut_mat = t(append(t(permut_mat), t(fill_in)))
+  X = X %*% permut_mat
+}else{
+  if(num_categorical_cols > 0) X = categorical_X
+  else{
+    if(num_scalar_cols > 0) X = scalar_X
+    else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
+  }
+}
+
+if(prob_missing_col > 0){
+  missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+  missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<")
+  #currently only support missing value imputation for scale cols
+  missing_col_ind = missing_col_ind * scalar_ind
+  missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
+  missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
+  missing_values = ppred(missing_values, prob_missing_val, "<")
+  X = append(X, missing_values)
+  
+  write(missing_col_ids, $out_missing, format="csv")
+}
+
+write(X, $out_X, format="csv")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Univariate.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4Univariate.dml b/scripts/datagen/genRandData4Univariate.dml
index d3c842c..bcbd528 100644
--- a/scripts/datagen/genRandData4Univariate.dml
+++ b/scripts/datagen/genRandData4Univariate.dml
@@ -1,61 +1,61 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random numbers from a distribution
-# with specified mean, standard deviation, 
-# skewness, kurtosis
-# mean and standard deviation are taken in as
-# arguments by this script
-# a,b,c,d are coefficients computed by some
-# equation solver determined from the specified
-# skewness and kurtosis using power method
-# polynomials
-#
-# for more details see:
-# Statistical Simulation: Power Method Polynomials
-# and Other Transformations
-# Author: Todd C. Headrick
-# Chapman & Hall/CRC, Boca Raton, FL, 2010.
-# ISBN 978-1-4200-6490-2
-
-# $1 is the number of random points to be sampled
-# $2 is specified mean
-# $3 is specified standard deviation
-# $4-$7 are a,b,c,d obtained by solving a system
-# of equations using specified kurtosis and skewness
-# $8 is the file to write out the generated data to
-
-numSamples = $1
-mu = $2
-sigma = $3
-a = $4
-b = $5
-c = $6
-d = $7
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random numbers from a distribution
+# with specified mean, standard deviation, 
+# skewness, kurtosis
+# mean and standard deviation are taken in as
+# arguments by this script
+# a,b,c,d are coefficients computed by some
+# equation solver determined from the specified
+# skewness and kurtosis using power method
+# polynomials
+#
+# for more details see:
+# Statistical Simulation: Power Method Polynomials
+# and Other Transformations
+# Author: Todd C. Headrick
+# Chapman & Hall/CRC, Boca Raton, FL, 2010.
+# ISBN 978-1-4200-6490-2
+
+# $1 is the number of random points to be sampled
+# $2 is specified mean
+# $3 is specified standard deviation
+# $4-$7 are a,b,c,d obtained by solving a system
+# of equations using specified kurtosis and skewness
+# $8 is the file to write out the generated data to
+
+numSamples = $1
+mu = $2
+sigma = $3
+a = $4
+b = $5
+c = $6
+d = $7
+
 
 print("a=" + a + " b=" + b + " c=" + c + " d=" + d)
 
-X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = a + b*X + c*X^2 + d*X^3
-
-Z = Y*sigma + mu
-write(Z, $8, format="binary")
+X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
+Y = a + b*X + c*X^2 + d*X^3
+
+Z = Y*sigma + mu
+write(Z, $8, format="binary")

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/PPCA.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/PPCA.dml b/scripts/staging/PPCA.dml
index 667c709..abfcdb1 100644
--- a/scripts/staging/PPCA.dml
+++ b/scripts/staging/PPCA.dml
@@ -1,160 +1,160 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
- 
-# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. 
-# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed 
-# Platforms. Tarek Elgamal et.al.
-
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME   	TYPE   DEFAULT  MEANING
-# ---------------------------------------------------------------------------------------------
-# X  	 	String ---      location to read the matrix X input matrix
-# k      	Int    ---      indicates dimension of the new vector space constructed from eigen vectors
-# tolobj 	Int    0.00001  objective function tolerance value to stop ppca algorithm
-# tolrecerr	Int    0.02     reconstruction error tolerance value to stop the algorithm	
-# iter   	Int    10       maximum number of iterations
-# fmt    	String 'text'   output format of results PPCA such as "text" or "csv"
-# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X  C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
-# ---------------------------------------------------------------------------------------------
-# OUTPUT PARAMETERS: 
-# ---------------------------------------------------------------------------------------------
-# NAME   TYPE   DEFAULT  MEANING
-# ---------------------------------------------------------------------------------------------
-# C     	Matrix  ---     principal components
-# V      	Matrix  ---     eigenvalues / eigenvalues of principal components
-#
-
-X = read($X);
-
-fileC = $C;
-fileV = $V;
-
-k = ifdef($k, ncol(X));
-iter = ifdef($iter, 10);
-tolobj = ifdef($tolobj, 0.00001);
-tolrecerr = ifdef($tolrecerr, 0.02);
-fmt0 = ifdef($fmt, "text"); 
-
-n = nrow(X);
-m = ncol(X);
-
-#initializing principal components matrix
-C =  rand(rows=m, cols=k, pdf="normal");
-ss = rand(rows=1, cols=1, pdf="normal");
-ss = as.scalar(ss);
-ssPrev = ss;
-
-# best selected principle components - with the lowest reconstruction error 
-PC = C;
-
-# initilizing reconstruction error
-RE = tolrecerr+1;
-REBest = RE;
-
-Z = matrix(0,rows=1,cols=1);
-
-#Objective function value
-ObjRelChng = tolobj+1;
-
-# mean centered input matrix - dim -> [n,m]
-Xm = X - colMeans(X);
-
-#I -> k x k
-ITMP = matrix(1,rows=k,cols=1);
-I = diag(ITMP);
-
-i = 0;
-while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
-	#Estimation step - Covariance matrix 
-	#M -> k x k
-	M = t(C) %*% C + I*ss; 
-	
-	#Auxilary matrix with n latent variables 
-	# Z -> n x k		
-	Z = Xm %*% (C %*% inv(M)); 
-
-	#ZtZ -> k x k
-	ZtZ = t(Z) %*% Z + inv(M)*ss;
-	
-	#XtZ -> m x k
-	XtZ = t(Xm) %*% Z;
-	
-	#Maximization step
-	#C ->  m x k
-	ZtZ_sum = sum(ZtZ); #+n*inv(M)); 
-	C = XtZ/ZtZ_sum;
-
-	#ss2 -> 1 x 1
-	ss2 = trace(ZtZ * (t(C) %*% C));
-
-	#ss3 -> 1 x 1 
-	ss3 = sum((Z %*% t(C)) %*% t(Xm));
-	
-	#Frobenius norm of reconstruction error -> Euclidean norm 
-	#Fn -> 1 x 1	
-	Fn = sum(Xm*Xm);
-
-	#ss -> 1 x 1
-	ss = (Fn + ss2 - 2*ss3)/(n*m);
-
-   #calculating objective function relative change
-   ObjRelChng = abs(1 - ss/ssPrev);
-   #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
-
-	#Reconstruction error
-	R = ((Z %*% t(C)) -  Xm);	
-
-	#calculate the error
-	#TODO rethink calculation of reconstruction error .... 
-	#1-Norm of reconstruction error - a big dense matrix 
-	#RE -> n x m
-	RE = abs(sum(R)/sum(Xm));	
-	if (RE < REBest){
-		PC = C;
-		REBest = RE;
-	}	
-	#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2  +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
-
-	ssPrev = ss;	
-	i = i+1;
-}
-print("Objective Relative Change: " + ObjRelChng);
-print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
-
-# reconstructs data
-# RD -> n x k
-RD = X %*% PC;
-
-# calculate eigenvalues - principle component variance
-RDMean = colMeans(RD);
-V = t(colMeans(RD*RD) - (RDMean*RDMean));
-
-# sorting eigenvalues and eigenvectors in decreasing order
-V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
-VF_decr = table(seq(1,nrow(V)),V_decr_idx);
-V = VF_decr %*% V;
-PC = PC %*% VF_decr;
-
-# writing principal components 
-write(PC, fileC, format=fmt0);
-# writing eigen values/pc variance
-write(V, fileV, format=fmt0);
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+ 
+# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. 
+# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed 
+# Platforms. Tarek Elgamal et.al.
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME   	TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X  	 	String ---      location to read the matrix X input matrix
+# k      	Int    ---      indicates dimension of the new vector space constructed from eigen vectors
+# tolobj 	Int    0.00001  objective function tolerance value to stop ppca algorithm
+# tolrecerr	Int    0.02     reconstruction error tolerance value to stop the algorithm	
+# iter   	Int    10       maximum number of iterations
+# fmt    	String 'text'   output format of results PPCA such as "text" or "csv"
+# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X  C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS: 
+# ---------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# C     	Matrix  ---     principal components
+# V      	Matrix  ---     eigenvalues / eigenvalues of principal components
+#
+
+X = read($X);
+
+fileC = $C;
+fileV = $V;
+
+k = ifdef($k, ncol(X));
+iter = ifdef($iter, 10);
+tolobj = ifdef($tolobj, 0.00001);
+tolrecerr = ifdef($tolrecerr, 0.02);
+fmt0 = ifdef($fmt, "text"); 
+
+n = nrow(X);
+m = ncol(X);
+
+#initializing principal components matrix
+C =  rand(rows=m, cols=k, pdf="normal");
+ss = rand(rows=1, cols=1, pdf="normal");
+ss = as.scalar(ss);
+ssPrev = ss;
+
+# best selected principle components - with the lowest reconstruction error 
+PC = C;
+
+# initilizing reconstruction error
+RE = tolrecerr+1;
+REBest = RE;
+
+Z = matrix(0,rows=1,cols=1);
+
+#Objective function value
+ObjRelChng = tolobj+1;
+
+# mean centered input matrix - dim -> [n,m]
+Xm = X - colMeans(X);
+
+#I -> k x k
+ITMP = matrix(1,rows=k,cols=1);
+I = diag(ITMP);
+
+i = 0;
+while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){
+	#Estimation step - Covariance matrix 
+	#M -> k x k
+	M = t(C) %*% C + I*ss; 
+	
+	#Auxilary matrix with n latent variables 
+	# Z -> n x k		
+	Z = Xm %*% (C %*% inv(M)); 
+
+	#ZtZ -> k x k
+	ZtZ = t(Z) %*% Z + inv(M)*ss;
+	
+	#XtZ -> m x k
+	XtZ = t(Xm) %*% Z;
+	
+	#Maximization step
+	#C ->  m x k
+	ZtZ_sum = sum(ZtZ); #+n*inv(M)); 
+	C = XtZ/ZtZ_sum;
+
+	#ss2 -> 1 x 1
+	ss2 = trace(ZtZ * (t(C) %*% C));
+
+	#ss3 -> 1 x 1 
+	ss3 = sum((Z %*% t(C)) %*% t(Xm));
+	
+	#Frobenius norm of reconstruction error -> Euclidean norm 
+	#Fn -> 1 x 1	
+	Fn = sum(Xm*Xm);
+
+	#ss -> 1 x 1
+	ss = (Fn + ss2 - 2*ss3)/(n*m);
+
+   #calculating objective function relative change
+   ObjRelChng = abs(1 - ss/ssPrev);
+   #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss);
+
+	#Reconstruction error
+	R = ((Z %*% t(C)) -  Xm);	
+
+	#calculate the error
+	#TODO rethink calculation of reconstruction error .... 
+	#1-Norm of reconstruction error - a big dense matrix 
+	#RE -> n x m
+	RE = abs(sum(R)/sum(Xm));	
+	if (RE < REBest){
+		PC = C;
+		REBest = RE;
+	}	
+	#print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2  +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE);
+
+	ssPrev = ss;	
+	i = i+1;
+}
+print("Objective Relative Change: " + ObjRelChng);
+print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest);
+
+# reconstructs data
+# RD -> n x k
+RD = X %*% PC;
+
+# calculate eigenvalues - principle component variance
+RDMean = colMeans(RD);
+V = t(colMeans(RD*RD) - (RDMean*RDMean));
+
+# sorting eigenvalues and eigenvectors in decreasing order
+V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE);
+VF_decr = table(seq(1,nrow(V)),V_decr_idx);
+V = VF_decr %*% V;
+PC = PC %*% VF_decr;
+
+# writing principal components 
+write(PC, fileC, format=fmt0);
+# writing eigen values/pc variance
+write(V, fileV, format=fmt0);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/regression/lasso/lasso.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/regression/lasso/lasso.dml b/scripts/staging/regression/lasso/lasso.dml
index fb520df..1da88d3 100644
--- a/scripts/staging/regression/lasso/lasso.dml
+++ b/scripts/staging/regression/lasso/lasso.dml
@@ -1,113 +1,113 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#uses the sparsa algorithm to perform lasso regression
-
-X = read($X)
-y = read($Y)
-n = nrow(X)
-m = ncol(X)
-
-#params
-tol = 10^(-15)
-M = 5
-tau = 1
-maxiter = 1000
-
-#constants
-eta = 2
-sigma = 0.01
-alpha_min = 10^(-30)
-alpha_max = 10^(30)
-
-#init
-alpha = 1
-w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
-history = -1*10^30 * matrix(1, rows=M, cols=1)
-
-r = X %*% w - y
-g = t(X) %*% r
-obj = 0.5 * sum(r*r) + tau*sum(abs(w))
-
-print("Initial OBJ=" + obj)
-
-history[M,1] = obj
-
-inactive_set = matrix(1, rows=m, cols=1)
-iter = 0
-continue = TRUE
-while(iter < maxiter & continue) {
-	dw = matrix(0, rows=m, cols=1)
-	dg = matrix(0, rows=m, cols=1)
-	relChangeObj = -1.0
-	
-	inner_iter = 0
-	inner_continue = TRUE
-	inner_maxiter = 100
-	while(inner_iter < inner_maxiter & inner_continue) {
-		u = w - g/alpha
-		lambda = tau/alpha
-		
-		wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">")
-		dw = wnew - w
-		dw2 = sum(dw*dw)
-		
-		r = X %*% wnew - y
-		gnew = t(X) %*% r
-		objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))		
-		obj_threshold = max(history) - 0.5*sigma*alpha*dw2
-		
-		if(objnew <= obj_threshold) {
-			w = wnew
-			dg = gnew - g
-			g = gnew
-			inner_continue = FALSE
-			
-			history[1:(M-1),] = history[2:M,]
-			history[M,1] = objnew
-			relChangeObj = abs(objnew - obj)/obj
-			obj = objnew
-		}
-		else 
-			alpha = eta*alpha
-	
-		inner_iter = inner_iter + 1
-	}
-	
-	if(inner_continue) 
-		print("Inner loop did not converge")
-	
-	alphanew = sum(dw*dg)/sum(dw*dw)
-	alpha = max(alpha_min, min(alpha_max, alphanew))
-	
-	old_inactive_set = inactive_set
-	inactive_set = ppred(w, 0, "!=")
-	diff = sum(abs(old_inactive_set - inactive_set))
-
-	if(diff == 0 & relChangeObj < tol) 
-		continue = FALSE
-
-	num_inactive = sum(ppred(w, 0, "!="))
-	print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
-	iter = iter + 1
-}
-
-write(w, $model)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#uses the sparsa algorithm to perform lasso regression
+
+X = read($X)
+y = read($Y)
+n = nrow(X)
+m = ncol(X)
+
+#params
+tol = 10^(-15)
+M = 5
+tau = 1
+maxiter = 1000
+
+#constants
+eta = 2
+sigma = 0.01
+alpha_min = 10^(-30)
+alpha_max = 10^(30)
+
+#init
+alpha = 1
+w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform")
+history = -1*10^30 * matrix(1, rows=M, cols=1)
+
+r = X %*% w - y
+g = t(X) %*% r
+obj = 0.5 * sum(r*r) + tau*sum(abs(w))
+
+print("Initial OBJ=" + obj)
+
+history[M,1] = obj
+
+inactive_set = matrix(1, rows=m, cols=1)
+iter = 0
+continue = TRUE
+while(iter < maxiter & continue) {
+	dw = matrix(0, rows=m, cols=1)
+	dg = matrix(0, rows=m, cols=1)
+	relChangeObj = -1.0
+	
+	inner_iter = 0
+	inner_continue = TRUE
+	inner_maxiter = 100
+	while(inner_iter < inner_maxiter & inner_continue) {
+		u = w - g/alpha
+		lambda = tau/alpha
+		
+		wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">")
+		dw = wnew - w
+		dw2 = sum(dw*dw)
+		
+		r = X %*% wnew - y
+		gnew = t(X) %*% r
+		objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew))		
+		obj_threshold = max(history) - 0.5*sigma*alpha*dw2
+		
+		if(objnew <= obj_threshold) {
+			w = wnew
+			dg = gnew - g
+			g = gnew
+			inner_continue = FALSE
+			
+			history[1:(M-1),] = history[2:M,]
+			history[M,1] = objnew
+			relChangeObj = abs(objnew - obj)/obj
+			obj = objnew
+		}
+		else 
+			alpha = eta*alpha
+	
+		inner_iter = inner_iter + 1
+	}
+	
+	if(inner_continue) 
+		print("Inner loop did not converge")
+	
+	alphanew = sum(dw*dg)/sum(dw*dw)
+	alpha = max(alpha_min, min(alpha_max, alphanew))
+	
+	old_inactive_set = inactive_set
+	inactive_set = ppred(w, 0, "!=")
+	diff = sum(abs(old_inactive_set - inactive_set))
+
+	if(diff == 0 & relChangeObj < tol) 
+		continue = FALSE
+
+	num_inactive = sum(ppred(w, 0, "!="))
+	print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive)
+	iter = iter + 1
+}
+
+write(w, $model)