You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:33:56 UTC
[20/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/m-svm/m-svm.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/m-svm/m-svm.dml b/src/test/scripts/applications/m-svm/m-svm.dml
index 1307707..bbf5acc 100644
--- a/src/test/scripts/applications/m-svm/m-svm.dml
+++ b/src/test/scripts/applications/m-svm/m-svm.dml
@@ -1,145 +1,145 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multiclass SVM with squared slack variables,
-# learns one-against-the-rest binary-class classifiers
-#
-# Example Usage:
-# Assume SVM_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100
-#
-# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
-#
-
-cmdLine_fmt=ifdef($fmt, "text")
-cmdLine_icpt = ifdef($icpt, 0)
-cmdLine_tol=ifdef($tol, 0.001)
-cmdLine_reg=ifdef($reg, 1.0)
-cmdLine_maxiter=ifdef($maxiter, 100)
-
-print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
-
-X = read($X)
-
-check_X = sum(X)
-if(check_X == 0){
- print("X has no non-zeros")
-}else{
- Y = read($Y)
- intercept = cmdLine_icpt
- num_classes = $classes
- epsilon = cmdLine_tol
- lambda = cmdLine_reg
- max_iterations = cmdLine_maxiter
-
- num_samples = nrow(X)
- num_features = ncol(X)
-
- if (intercept == 1) {
- ones = matrix(1, rows=num_samples, cols=1);
- X = append(X, ones);
- }
-
- num_rows_in_w = num_features
- if(intercept == 1){
- num_rows_in_w = num_rows_in_w + 1
- }
- w = matrix(0, rows=num_rows_in_w, cols=num_classes)
-
- debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
- parfor(iter_class in 1:num_classes){
- Y_local = 2 * ppred(Y, iter_class, "==") - 1
- w_class = matrix(0, rows=num_features, cols=1)
- if (intercept == 1) {
- zero_matrix = matrix(0, rows=1, cols=1);
- w_class = t(append(t(w_class), zero_matrix));
- }
-
- g_old = t(X) %*% Y_local
- s = g_old
-
- Xw = matrix(0, rows=nrow(X), cols=1)
- iter = 0
- continue = 1
- while(continue == 1) {
- # minimizing primal obj along direction s
- step_sz = 0
- Xd = X %*% s
- wd = lambda * sum(w_class * s)
- dd = lambda * sum(s * s)
- continue1 = 1
- while(continue1 == 1){
- tmp_Xw = Xw + step_sz*Xd
- out = 1 - Y_local * (tmp_Xw)
- sv = ppred(out, 0, ">")
- out = out * sv
- g = wd + step_sz*dd - sum(out * Y_local * Xd)
- h = dd + sum(Xd * sv * Xd)
- step_sz = step_sz - g/h
- if (g*g/h < 0.0000000001){
- continue1 = 0
- }
- }
-
- #update weights
- w_class = w_class + step_sz*s
- Xw = Xw + step_sz*Xd
-
- out = 1 - Y_local * Xw
- sv = ppred(out, 0, ">")
- out = sv * out
- obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
- g_new = t(X) %*% (out * Y_local) - lambda * w_class
-
- tmp = sum(s * g_old)
-
- train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100
- print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
- debug_mat[iter+1,iter_class] = obj
-
- if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
- continue = 0
- }
-
- #non-linear CG step
- be = sum(g_new * g_new)/sum(g_old * g_old)
- s = be * s + g_new
- g_old = g_new
-
- iter = iter + 1
- }
-
- w[,iter_class] = w_class
- }
-
- write(w, $model, format=cmdLine_fmt)
-
- debug_str = "# Class, Iter, Obj"
- for(iter_class in 1:ncol(debug_mat)){
- for(iter in 1:nrow(debug_mat)){
- obj = castAsScalar(debug_mat[iter, iter_class])
- if(obj != -1)
- debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
- }
- }
- write(debug_str, $Log)
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multiclass SVM with squared slack variables,
+# learns one-against-the-rest binary-class classifiers
+#
+# Example Usage:
+# Assume SVM_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100
+#
+# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
+#
+
+cmdLine_fmt=ifdef($fmt, "text")
+cmdLine_icpt = ifdef($icpt, 0)
+cmdLine_tol=ifdef($tol, 0.001)
+cmdLine_reg=ifdef($reg, 1.0)
+cmdLine_maxiter=ifdef($maxiter, 100)
+
+print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
+
+X = read($X)
+
+check_X = sum(X)
+if(check_X == 0){
+ print("X has no non-zeros")
+}else{
+ Y = read($Y)
+ intercept = cmdLine_icpt
+ num_classes = $classes
+ epsilon = cmdLine_tol
+ lambda = cmdLine_reg
+ max_iterations = cmdLine_maxiter
+
+ num_samples = nrow(X)
+ num_features = ncol(X)
+
+ if (intercept == 1) {
+ ones = matrix(1, rows=num_samples, cols=1);
+ X = append(X, ones);
+ }
+
+ num_rows_in_w = num_features
+ if(intercept == 1){
+ num_rows_in_w = num_rows_in_w + 1
+ }
+ w = matrix(0, rows=num_rows_in_w, cols=num_classes)
+
+ debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
+ parfor(iter_class in 1:num_classes){
+ Y_local = 2 * ppred(Y, iter_class, "==") - 1
+ w_class = matrix(0, rows=num_features, cols=1)
+ if (intercept == 1) {
+ zero_matrix = matrix(0, rows=1, cols=1);
+ w_class = t(append(t(w_class), zero_matrix));
+ }
+
+ g_old = t(X) %*% Y_local
+ s = g_old
+
+ Xw = matrix(0, rows=nrow(X), cols=1)
+ iter = 0
+ continue = 1
+ while(continue == 1) {
+ # minimizing primal obj along direction s
+ step_sz = 0
+ Xd = X %*% s
+ wd = lambda * sum(w_class * s)
+ dd = lambda * sum(s * s)
+ continue1 = 1
+ while(continue1 == 1){
+ tmp_Xw = Xw + step_sz*Xd
+ out = 1 - Y_local * (tmp_Xw)
+ sv = ppred(out, 0, ">")
+ out = out * sv
+ g = wd + step_sz*dd - sum(out * Y_local * Xd)
+ h = dd + sum(Xd * sv * Xd)
+ step_sz = step_sz - g/h
+ if (g*g/h < 0.0000000001){
+ continue1 = 0
+ }
+ }
+
+ #update weights
+ w_class = w_class + step_sz*s
+ Xw = Xw + step_sz*Xd
+
+ out = 1 - Y_local * Xw
+ sv = ppred(out, 0, ">")
+ out = sv * out
+ obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+ g_new = t(X) %*% (out * Y_local) - lambda * w_class
+
+ tmp = sum(s * g_old)
+
+ train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100
+ print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
+ debug_mat[iter+1,iter_class] = obj
+
+ if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
+ continue = 0
+ }
+
+ #non-linear CG step
+ be = sum(g_new * g_new)/sum(g_old * g_old)
+ s = be * s + g_new
+ g_old = g_new
+
+ iter = iter + 1
+ }
+
+ w[,iter_class] = w_class
+ }
+
+ write(w, $model, format=cmdLine_fmt)
+
+ debug_str = "# Class, Iter, Obj"
+ for(iter_class in 1:ncol(debug_mat)){
+ for(iter in 1:nrow(debug_mat)){
+ obj = castAsScalar(debug_mat[iter, iter_class])
+ if(obj != -1)
+ debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
+ }
+ }
+ write(debug_str, $Log)
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/m-svm/m-svm.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/m-svm/m-svm.pydml b/src/test/scripts/applications/m-svm/m-svm.pydml
index 83f17cf..348f599 100644
--- a/src/test/scripts/applications/m-svm/m-svm.pydml
+++ b/src/test/scripts/applications/m-svm/m-svm.pydml
@@ -1,136 +1,136 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multiclass SVM with squared slack variables,
-# learns one-against-the-rest binary-class classifiers
-#
-# Example Usage:
-# Assume SVM_HOME is set to the home of the pydml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100
-#
-# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.pydml -python -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
-#
-
-cmdLine_fmt=ifdef($fmt, "text")
-cmdLine_icpt = ifdef($icpt, 0)
-cmdLine_tol=ifdef($tol, 0.001)
-cmdLine_reg=ifdef($reg, 1.0)
-cmdLine_maxiter=ifdef($maxiter, 100)
-
-print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
-
-X = load($X)
-
-check_X = sum(X)
-if(check_X == 0):
- print("X has no non-zeros")
-else:
- Y = load($Y)
- intercept = cmdLine_icpt
- num_classes = $classes
- epsilon = cmdLine_tol
- lambda = cmdLine_reg
- max_iterations = cmdLine_maxiter
-
- num_samples = nrow(X)
- num_features = ncol(X)
-
- if (intercept == 1):
- ones = full(1, rows=num_samples, cols=1)
- X = append(X, ones)
-
- num_rows_in_w = num_features
- if(intercept == 1):
- num_rows_in_w = num_rows_in_w + 1
- w = full(0, rows=num_rows_in_w, cols=num_classes)
-
- debug_mat = full(-1, rows=max_iterations, cols=num_classes)
- parfor(iter_class in 1:num_classes):
- Y_local = 2 * ppred(Y, iter_class, "==") - 1
- w_class = full(0, rows=num_features, cols=1)
- if (intercept == 1):
- zero_matrix = full(0, rows=1, cols=1)
- w_class = transpose(append(transpose(w_class), zero_matrix))
- g_old = dot(transpose(X), Y_local)
- s = g_old
-
- Xw = full(0, rows=nrow(X), cols=1)
- iter = 0
- continue = 1
- while(continue == 1):
- # minimizing primal obj along direction s
- step_sz = 0
- Xd = dot(X, s)
- wd = lambda * sum(w_class * s)
- dd = lambda * sum(s * s)
- continue1 = 1
- while(continue1 == 1):
- tmp_Xw = Xw + step_sz*Xd
- out = 1 - Y_local * (tmp_Xw)
- sv = ppred(out, 0, ">")
- out = out * sv
- g = wd + step_sz*dd - sum(out * Y_local * Xd)
- h = dd + sum(Xd * sv * Xd)
- step_sz = step_sz - g/h
- if (g*g/h < 0.0000000001):
- continue1 = 0
-
- #update weights
- w_class = w_class + step_sz*s
- Xw = Xw + step_sz*Xd
-
- out = 1 - Y_local * Xw
- sv = ppred(out, 0, ">")
- out = sv * out
- obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
- g_new = dot(transpose(X), (out * Y_local)) - lambda * w_class
-
- tmp = sum(s * g_old)
-
- train_acc = sum(ppred(Y_local*(dot(X, w_class)), 0, ">="))/num_samples*100
- print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
- debug_mat[iter+1,iter_class] = obj
-
- if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)):
- continue = 0
-
- #non-linear CG step
- be = sum(g_new * g_new)/sum(g_old * g_old)
- s = be * s + g_new
- g_old = g_new
-
- iter = iter + 1
- # end while(continue == 1)
-
- w[,iter_class] = w_class
- # end parfor(iter_class in 1:num_classes)
-
- save(w, $model, format=cmdLine_fmt)
-
- debug_str = "# Class, Iter, Obj"
- for(iter_class in 1:ncol(debug_mat)):
- for(iter in 1:nrow(debug_mat)):
- obj = castAsScalar(debug_mat[iter, iter_class])
- if(obj != -1):
- debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
- save(debug_str, $Log)
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multiclass SVM with squared slack variables,
+# learns one-against-the-rest binary-class classifiers
+#
+# Example Usage:
+# Assume SVM_HOME is set to the home of the pydml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100
+#
+# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.pydml -python -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
+#
+
+cmdLine_fmt=ifdef($fmt, "text")
+cmdLine_icpt = ifdef($icpt, 0)
+cmdLine_tol=ifdef($tol, 0.001)
+cmdLine_reg=ifdef($reg, 1.0)
+cmdLine_maxiter=ifdef($maxiter, 100)
+
+print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
+
+X = load($X)
+
+check_X = sum(X)
+if(check_X == 0):
+ print("X has no non-zeros")
+else:
+ Y = load($Y)
+ intercept = cmdLine_icpt
+ num_classes = $classes
+ epsilon = cmdLine_tol
+ lambda = cmdLine_reg
+ max_iterations = cmdLine_maxiter
+
+ num_samples = nrow(X)
+ num_features = ncol(X)
+
+ if (intercept == 1):
+ ones = full(1, rows=num_samples, cols=1)
+ X = append(X, ones)
+
+ num_rows_in_w = num_features
+ if(intercept == 1):
+ num_rows_in_w = num_rows_in_w + 1
+ w = full(0, rows=num_rows_in_w, cols=num_classes)
+
+ debug_mat = full(-1, rows=max_iterations, cols=num_classes)
+ parfor(iter_class in 1:num_classes):
+ Y_local = 2 * ppred(Y, iter_class, "==") - 1
+ w_class = full(0, rows=num_features, cols=1)
+ if (intercept == 1):
+ zero_matrix = full(0, rows=1, cols=1)
+ w_class = transpose(append(transpose(w_class), zero_matrix))
+ g_old = dot(transpose(X), Y_local)
+ s = g_old
+
+ Xw = full(0, rows=nrow(X), cols=1)
+ iter = 0
+ continue = 1
+ while(continue == 1):
+ # minimizing primal obj along direction s
+ step_sz = 0
+ Xd = dot(X, s)
+ wd = lambda * sum(w_class * s)
+ dd = lambda * sum(s * s)
+ continue1 = 1
+ while(continue1 == 1):
+ tmp_Xw = Xw + step_sz*Xd
+ out = 1 - Y_local * (tmp_Xw)
+ sv = ppred(out, 0, ">")
+ out = out * sv
+ g = wd + step_sz*dd - sum(out * Y_local * Xd)
+ h = dd + sum(Xd * sv * Xd)
+ step_sz = step_sz - g/h
+ if (g*g/h < 0.0000000001):
+ continue1 = 0
+
+ #update weights
+ w_class = w_class + step_sz*s
+ Xw = Xw + step_sz*Xd
+
+ out = 1 - Y_local * Xw
+ sv = ppred(out, 0, ">")
+ out = sv * out
+ obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+ g_new = dot(transpose(X), (out * Y_local)) - lambda * w_class
+
+ tmp = sum(s * g_old)
+
+ train_acc = sum(ppred(Y_local*(dot(X, w_class)), 0, ">="))/num_samples*100
+ print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
+ debug_mat[iter+1,iter_class] = obj
+
+ if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)):
+ continue = 0
+
+ #non-linear CG step
+ be = sum(g_new * g_new)/sum(g_old * g_old)
+ s = be * s + g_new
+ g_old = g_new
+
+ iter = iter + 1
+ # end while(continue == 1)
+
+ w[,iter_class] = w_class
+ # end parfor(iter_class in 1:num_classes)
+
+ save(w, $model, format=cmdLine_fmt)
+
+ debug_str = "# Class, Iter, Obj"
+ for(iter_class in 1:ncol(debug_mat)):
+ for(iter in 1:nrow(debug_mat)):
+ obj = castAsScalar(debug_mat[iter, iter_class])
+ if(obj != -1):
+ debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
+ save(debug_str, $Log)
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/mdabivar/MDABivariateStats.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/mdabivar/MDABivariateStats.R b/src/test/scripts/applications/mdabivar/MDABivariateStats.R
index 7715c5e..1844bbb 100644
--- a/src/test/scripts/applications/mdabivar/MDABivariateStats.R
+++ b/src/test/scripts/applications/mdabivar/MDABivariateStats.R
@@ -1,294 +1,294 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-bivar_ss = function(X, Y) {
-
- # Unweighted co-variance
- covXY = cov(X,Y)
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- m2X = var(X)
- m2Y = var(Y)
- sigmaX = sqrt(m2X)
- sigmaY = sqrt(m2Y)
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY)
-
- return(list("R" = R, "covXY" = covXY, "sigmaX" = sigmaX, "sigmaY" = sigmaY))
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(A, B) {
-
- # Contingency Table
- F = table(A,B)
-
- # Chi-Squared
- cst = chisq.test(F)
-
- r = rowSums(F)
- c = colSums(F)
-
- chi_squared = as.numeric(cst[1])
-
- # compute p-value
- pValue = as.numeric(cst[3])
-
- # Assign return values
- pval = pValue
- contingencyTable = F
- rowMarginals = r
- colMarginals = c
-
- return(list("pval" = pval, "contingencyTable" = contingencyTable, "rowMarginals" = rowMarginals, "colMarginals" = colMarginals))
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Y, A) {
- # mean and variance in target variable
- W = length(A)
- my = mean(Y)
- varY = var(Y)
-
- # category-wise (frequencies, means, variances)
- CFreqs = as.matrix(table(A))
-
- CMeans = as.matrix(aggregate(Y, by=list(A), "mean")$x)
-
- CVars = as.matrix(aggregate(Y, by=list(A), "var")$x)
- CVars[is.na(CVars)] <- 0
-
- # number of categories
- R = nrow(CFreqs)
- df1 = R-1
- df2 = W-R
-
- anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1)
- anova_den = sum( (CFreqs-1)*CVars )/(W-R)
- AnovaF = anova_num/anova_den
- pVal = 1-pf(AnovaF, df1, df2)
-
- return(list("pVal" = pVal, "CFreqs" = CFreqs, "CMeans" = CMeans, "CVars" = CVars))
-}
-
-# Main starts here -----------------------------------------------------------------------------------------------------------
-
-args <- commandArgs(TRUE)
-
-library(Matrix)
-
-# input data set
-D = readMM(paste(args[1], "X.mtx", sep=""));
-
-# label attr id (must be a valid index > 0)
-label_index = as.integer(args[2])
-
-# feature attributes, column vector of indices
-feature_indices = readMM(paste(args[1], "feature_indices.mtx", sep=""))
-
-# can be either 1 (scale) or 0 (categorical)
-label_measurement_level = as.integer(args[3])
-
-# measurement levels for features, 0/1 column vector
-feature_measurement_levels = readMM(paste(args[1], "feature_measurement_levels.mtx", sep=""))
-
-sz = ncol(D)
-
-# store for pvalues and pearson's r
-stats = matrix(0, sz, 1)
-# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
-tests = matrix(0, sz, 1)
-# store for covariances used to compute pearson's r
-covariances = matrix(0, sz, 1)
-# store for standard deviations used to compute pearson's r
-standard_deviations = matrix(0, sz, 1)
-
-labels = D[,label_index]
-
-labelCorrection = 0
-if(label_measurement_level == 1){
- numLabels = length(labels)
- cmLabels = var(labels)
- stdLabels = sqrt(cmLabels)
- standard_deviations[label_index,1] = stdLabels
-}else{
- labelCorrection = 1 - min(labels)
- labels = labels + labelCorrection
-}
-
-mx = apply(D, 2, max)
-mn = apply(D, 2, min)
-num_distinct_values = mx-mn+1
-max_num_distinct_values = 0
-for(i1 in 1:nrow(feature_indices)){
- feature_index1 = feature_indices[i1,1]
- num = num_distinct_values[feature_index1]
- if(feature_measurement_levels[i1,1] == 0 & num >= max_num_distinct_values){
- max_num_distinct_values = num
- }
-}
-distinct_label_values = matrix(0, 1, 1)
-contingencyTableSz = 1
-maxNumberOfGroups = 1
-if(max_num_distinct_values != 0){
- maxNumberOfGroups = max_num_distinct_values
-}
-if(label_measurement_level==0){
- distinct_label_values = as.data.frame(table(labels))$Freq
- if(max_num_distinct_values != 0){
- contingencyTableSz = max_num_distinct_values*length(distinct_label_values)
- }
- maxNumberOfGroups = max(maxNumberOfGroups, length(distinct_label_values))
-}
-# store for contingency table cell values
-contingencyTablesCounts = matrix(0, sz, contingencyTableSz)
-# store for contingency table label(row) assignments
-contingencyTablesLabelValues = matrix(0, sz, contingencyTableSz)
-# store for contingency table feature(col) assignments
-contingencyTablesFeatureValues = matrix(0, sz, contingencyTableSz)
-# store for distinct values
-featureValues = matrix(0, sz, maxNumberOfGroups)
-# store for counts of distinct values
-featureCounts = matrix(0, sz, maxNumberOfGroups)
-# store for group means
-featureMeans = matrix(0, sz, maxNumberOfGroups)
-# store for group standard deviations
-featureSTDs = matrix(0, sz, maxNumberOfGroups)
-
-if(label_measurement_level == 0){
- featureCounts[label_index,1:length(distinct_label_values)] = distinct_label_values
- for(i2 in 1:length(distinct_label_values)){
- featureValues[label_index,i2] = i2-labelCorrection
- }
-}
-
-for(i3 in 1:nrow(feature_indices)){
- feature_index2 = feature_indices[i3,1]
- feature_measurement_level = feature_measurement_levels[i3,1]
-
- feature = D[,feature_index2]
-
- if(feature_measurement_level == 0){
- featureCorrection = 1 - min(feature)
- feature = feature + featureCorrection
-
- if(label_measurement_level == feature_measurement_level){
- # categorical-categorical
- tests[feature_index2,1] = 1
-
- ret = bivar_cc(labels, feature)
- pVal = ret$pval
- contingencyTable = ret$contingencyTable
- rowMarginals = ret$rowMarginals
- colMarginals = ret$colMarginals
-
- stats[feature_index2,1] = pVal
-
- sz3 = nrow(contingencyTable)*ncol(contingencyTable)
-
- contingencyTableCounts = matrix(0, 1, sz3)
- contingencyTableLabelValues = matrix(0, 1, sz3)
- contingencyTableFeatureValues = matrix(0, 1, sz3)
-
- for(i4 in 1:nrow(contingencyTable)){
- for(j in 1:ncol(contingencyTable)){
- #get rid of this, see *1 below
- contingencyTableCounts[1, ncol(contingencyTable)*(i4-1)+j] = contingencyTable[i4,j]
-
- contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
- contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
- }
- }
- contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
-
- contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
- contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
-
- featureCounts[feature_index2,1:length(colMarginals)] = colMarginals
- for(i5 in 1:length(colMarginals)){
- featureValues[feature_index2,i5] = i5-featureCorrection
- }
- }else{
- # label is scale, feature is categorical
- tests[feature_index2,1] = 2
-
- ret = bivar_sc(labels, feature)
- pVal = ret$pVal
- frequencies = ret$CFreqs
- means = ret$CMeans
- variances = ret$CVars
-
- stats[feature_index2,1] = pVal
- featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies)
- for(i6 in 1:nrow(frequencies)){
- featureValues[feature_index2,i6] = i6 - featureCorrection
- }
- featureMeans[feature_index2,1:nrow(means)] = t(means)
- featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
- }
- }else{
- if(label_measurement_level == feature_measurement_level){
- # scale-scale
- tests[feature_index2,1] = 3
-
- ret = bivar_ss(labels, feature)
- r = ret$R
- covariance = ret$covXY
- stdX = ret$sigmaX
- stdY = ret$sigmaY
-
- stats[feature_index2,1] = r
- covariances[feature_index2,1] = covariance
- standard_deviations[feature_index2,1] = stdY
- }else{
- # label is categorical, feature is scale
- tests[feature_index2,1] = 2
-
- ret = bivar_sc(feature, labels)
- pVal = ret$pVal
- frequencies = ret$CFreqs
- means = ret$CMeans
- variances = ret$CVars
-
- stats[feature_index2,1] = pVal
- featureMeans[feature_index2,1:nrow(means)] = t(means)
- featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
- }
- }
-}
-
-writeMM(as(stats, "CsparseMatrix"), paste(args[4], "stats", sep=""))
-writeMM(as(tests, "CsparseMatrix"), paste(args[4], "tests", sep=""))
-writeMM(as(covariances, "CsparseMatrix"), paste(args[4], "covariances", sep=""))
-writeMM(as(standard_deviations, "CsparseMatrix"), paste(args[4], "standard_deviations", sep=""))
-writeMM(as(contingencyTablesCounts, "CsparseMatrix"), paste(args[4], "contingencyTablesCounts", sep=""))
-writeMM(as(contingencyTablesLabelValues, "CsparseMatrix"), paste(args[4], "contingencyTablesLabelValues", sep=""))
-writeMM(as(contingencyTablesFeatureValues, "CsparseMatrix"), paste(args[4], "contingencyTablesFeatureValues", sep=""))
-writeMM(as(featureValues, "CsparseMatrix"), paste(args[4], "featureValues", sep=""))
-writeMM(as(featureCounts, "CsparseMatrix"), paste(args[4], "featureCounts", sep=""))
-writeMM(as(featureMeans, "CsparseMatrix"), paste(args[4], "featureMeans", sep=""))
-writeMM(as(featureSTDs, "CsparseMatrix"), paste(args[4], "featureSTDs", sep=""))
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+bivar_ss = function(X, Y) {
+
+ # Unweighted co-variance
+ covXY = cov(X,Y)
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ m2X = var(X)
+ m2Y = var(Y)
+ sigmaX = sqrt(m2X)
+ sigmaY = sqrt(m2Y)
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY)
+
+ return(list("R" = R, "covXY" = covXY, "sigmaX" = sigmaX, "sigmaY" = sigmaY))
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(A, B) {
+
+ # Contingency Table
+ F = table(A,B)
+
+ # Chi-Squared
+ cst = chisq.test(F)
+
+ r = rowSums(F)
+ c = colSums(F)
+
+ chi_squared = as.numeric(cst[1])
+
+ # compute p-value
+ pValue = as.numeric(cst[3])
+
+ # Assign return values
+ pval = pValue
+ contingencyTable = F
+ rowMarginals = r
+ colMarginals = c
+
+ return(list("pval" = pval, "contingencyTable" = contingencyTable, "rowMarginals" = rowMarginals, "colMarginals" = colMarginals))
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Y, A) {
+ # mean and variance in target variable
+ W = length(A)
+ my = mean(Y)
+ varY = var(Y)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs = as.matrix(table(A))
+
+ CMeans = as.matrix(aggregate(Y, by=list(A), "mean")$x)
+
+ CVars = as.matrix(aggregate(Y, by=list(A), "var")$x)
+ CVars[is.na(CVars)] <- 0
+
+ # number of categories
+ R = nrow(CFreqs)
+ df1 = R-1
+ df2 = W-R
+
+ anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1)
+ anova_den = sum( (CFreqs-1)*CVars )/(W-R)
+ AnovaF = anova_num/anova_den
+ pVal = 1-pf(AnovaF, df1, df2)
+
+ return(list("pVal" = pVal, "CFreqs" = CFreqs, "CMeans" = CMeans, "CVars" = CVars))
+}
+
+# Main starts here -----------------------------------------------------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+
+library(Matrix)
+
+# input data set
+D = readMM(paste(args[1], "X.mtx", sep=""));
+
+# label attr id (must be a valid index > 0)
+label_index = as.integer(args[2])
+
+# feature attributes, column vector of indices
+feature_indices = readMM(paste(args[1], "feature_indices.mtx", sep=""))
+
+# can be either 1 (scale) or 0 (categorical)
+label_measurement_level = as.integer(args[3])
+
+# measurement levels for features, 0/1 column vector
+feature_measurement_levels = readMM(paste(args[1], "feature_measurement_levels.mtx", sep=""))
+
+sz = ncol(D)
+
+# store for pvalues and pearson's r
+stats = matrix(0, sz, 1)
+# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
+tests = matrix(0, sz, 1)
+# store for covariances used to compute pearson's r
+covariances = matrix(0, sz, 1)
+# store for standard deviations used to compute pearson's r
+standard_deviations = matrix(0, sz, 1)
+
+labels = D[,label_index]
+
+labelCorrection = 0
+if(label_measurement_level == 1){
+ numLabels = length(labels)
+ cmLabels = var(labels)
+ stdLabels = sqrt(cmLabels)
+ standard_deviations[label_index,1] = stdLabels
+}else{
+ labelCorrection = 1 - min(labels)
+ labels = labels + labelCorrection
+}
+
+mx = apply(D, 2, max)
+mn = apply(D, 2, min)
+num_distinct_values = mx-mn+1
+max_num_distinct_values = 0
+for(i1 in 1:nrow(feature_indices)){
+ feature_index1 = feature_indices[i1,1]
+ num = num_distinct_values[feature_index1]
+ if(feature_measurement_levels[i1,1] == 0 & num >= max_num_distinct_values){
+ max_num_distinct_values = num
+ }
+}
+distinct_label_values = matrix(0, 1, 1)
+contingencyTableSz = 1
+maxNumberOfGroups = 1
+if(max_num_distinct_values != 0){
+ maxNumberOfGroups = max_num_distinct_values
+}
+if(label_measurement_level==0){
+ distinct_label_values = as.data.frame(table(labels))$Freq
+ if(max_num_distinct_values != 0){
+ contingencyTableSz = max_num_distinct_values*length(distinct_label_values)
+ }
+ maxNumberOfGroups = max(maxNumberOfGroups, length(distinct_label_values))
+}
+# store for contingency table cell values
+contingencyTablesCounts = matrix(0, sz, contingencyTableSz)
+# store for contingency table label(row) assignments
+contingencyTablesLabelValues = matrix(0, sz, contingencyTableSz)
+# store for contingency table feature(col) assignments
+contingencyTablesFeatureValues = matrix(0, sz, contingencyTableSz)
+# store for distinct values
+featureValues = matrix(0, sz, maxNumberOfGroups)
+# store for counts of distinct values
+featureCounts = matrix(0, sz, maxNumberOfGroups)
+# store for group means
+featureMeans = matrix(0, sz, maxNumberOfGroups)
+# store for group standard deviations
+featureSTDs = matrix(0, sz, maxNumberOfGroups)
+
+if(label_measurement_level == 0){
+ featureCounts[label_index,1:length(distinct_label_values)] = distinct_label_values
+ for(i2 in 1:length(distinct_label_values)){
+ featureValues[label_index,i2] = i2-labelCorrection
+ }
+}
+
+for(i3 in 1:nrow(feature_indices)){
+ feature_index2 = feature_indices[i3,1]
+ feature_measurement_level = feature_measurement_levels[i3,1]
+
+ feature = D[,feature_index2]
+
+ if(feature_measurement_level == 0){
+ featureCorrection = 1 - min(feature)
+ feature = feature + featureCorrection
+
+ if(label_measurement_level == feature_measurement_level){
+ # categorical-categorical
+ tests[feature_index2,1] = 1
+
+ ret = bivar_cc(labels, feature)
+ pVal = ret$pval
+ contingencyTable = ret$contingencyTable
+ rowMarginals = ret$rowMarginals
+ colMarginals = ret$colMarginals
+
+ stats[feature_index2,1] = pVal
+
+ sz3 = nrow(contingencyTable)*ncol(contingencyTable)
+
+ contingencyTableCounts = matrix(0, 1, sz3)
+ contingencyTableLabelValues = matrix(0, 1, sz3)
+ contingencyTableFeatureValues = matrix(0, 1, sz3)
+
+ for(i4 in 1:nrow(contingencyTable)){
+ for(j in 1:ncol(contingencyTable)){
+ #get rid of this, see *1 below
+ contingencyTableCounts[1, ncol(contingencyTable)*(i4-1)+j] = contingencyTable[i4,j]
+
+ contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
+ contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
+ }
+ }
+ contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
+
+ contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
+ contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
+
+ featureCounts[feature_index2,1:length(colMarginals)] = colMarginals
+ for(i5 in 1:length(colMarginals)){
+ featureValues[feature_index2,i5] = i5-featureCorrection
+ }
+ }else{
+ # label is scale, feature is categorical
+ tests[feature_index2,1] = 2
+
+ ret = bivar_sc(labels, feature)
+ pVal = ret$pVal
+ frequencies = ret$CFreqs
+ means = ret$CMeans
+ variances = ret$CVars
+
+ stats[feature_index2,1] = pVal
+ featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies)
+ for(i6 in 1:nrow(frequencies)){
+ featureValues[feature_index2,i6] = i6 - featureCorrection
+ }
+ featureMeans[feature_index2,1:nrow(means)] = t(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
+ }
+ }else{
+ if(label_measurement_level == feature_measurement_level){
+ # scale-scale
+ tests[feature_index2,1] = 3
+
+ ret = bivar_ss(labels, feature)
+ r = ret$R
+ covariance = ret$covXY
+ stdX = ret$sigmaX
+ stdY = ret$sigmaY
+
+ stats[feature_index2,1] = r
+ covariances[feature_index2,1] = covariance
+ standard_deviations[feature_index2,1] = stdY
+ }else{
+ # label is categorical, feature is scale
+ tests[feature_index2,1] = 2
+
+ ret = bivar_sc(feature, labels)
+ pVal = ret$pVal
+ frequencies = ret$CFreqs
+ means = ret$CMeans
+ variances = ret$CVars
+
+ stats[feature_index2,1] = pVal
+ featureMeans[feature_index2,1:nrow(means)] = t(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
+ }
+ }
+}
+
+writeMM(as(stats, "CsparseMatrix"), paste(args[4], "stats", sep=""))
+writeMM(as(tests, "CsparseMatrix"), paste(args[4], "tests", sep=""))
+writeMM(as(covariances, "CsparseMatrix"), paste(args[4], "covariances", sep=""))
+writeMM(as(standard_deviations, "CsparseMatrix"), paste(args[4], "standard_deviations", sep=""))
+writeMM(as(contingencyTablesCounts, "CsparseMatrix"), paste(args[4], "contingencyTablesCounts", sep=""))
+writeMM(as(contingencyTablesLabelValues, "CsparseMatrix"), paste(args[4], "contingencyTablesLabelValues", sep=""))
+writeMM(as(contingencyTablesFeatureValues, "CsparseMatrix"), paste(args[4], "contingencyTablesFeatureValues", sep=""))
+writeMM(as(featureValues, "CsparseMatrix"), paste(args[4], "featureValues", sep=""))
+writeMM(as(featureCounts, "CsparseMatrix"), paste(args[4], "featureCounts", sep=""))
+writeMM(as(featureMeans, "CsparseMatrix"), paste(args[4], "featureMeans", sep=""))
+writeMM(as(featureSTDs, "CsparseMatrix"), paste(args[4], "featureSTDs", sep=""))
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/mdabivar/MDABivariateStats.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/mdabivar/MDABivariateStats.dml b/src/test/scripts/applications/mdabivar/MDABivariateStats.dml
index 1e04154..56163ad 100644
--- a/src/test/scripts/applications/mdabivar/MDABivariateStats.dml
+++ b/src/test/scripts/applications/mdabivar/MDABivariateStats.dml
@@ -1,268 +1,268 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Main starts here -----------------------------------------------------------------------------------------------------------
-
-# input data set
-D = read($1)
-
-# label attr id (must be a valid index > 0)
-label_index = $2
-
-# feature attributes, column vector of indices
-feature_indices = read($3)
-
-# can be either 1 (scale) or 0 (categorical)
-label_measurement_level = $4
-
-# measurement levels for features, 0/1 column vector
-feature_measurement_levels = read($5)
-
-sz = ncol(D)
-
-# store for pvalues and pearson's r
-stats = matrix(0, rows=sz, cols=1)
-# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
-tests = matrix(0, rows=sz, cols=1)
-# store for covariances used to compute pearson's r
-covariances = matrix(0, rows=sz, cols=1)
-# store for standard deviations used to compute pearson's r
-standard_deviations = matrix(0, rows=sz, cols=1)
-
-labels = D[,label_index]
-
-labelCorrection = 0
-if(label_measurement_level == 1){
- numLabels = nrow(labels)
- cmLabels = moment(labels,2)
- stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) )
- standard_deviations[label_index,1] = stdLabels
-}else{
- labelCorrection = 1 - min(labels)
- labels = labels + labelCorrection
-}
-
-mx = colMaxs(D)
-mn = colMins(D)
-num_distinct_values = mx-mn+1
-max_num_distinct_values = 0
-for(i1 in 1:nrow(feature_indices)){
- feature_index1 = castAsScalar(feature_indices[i1,1])
- num = castAsScalar(num_distinct_values[1,feature_index1])
- if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values){
- max_num_distinct_values = num
- }
-}
-distinct_label_values = matrix(0, rows=1, cols=1)
-contingencyTableSz = 1
-maxNumberOfGroups = 1
-if(max_num_distinct_values != 0){
- maxNumberOfGroups = max_num_distinct_values
-}
-if(label_measurement_level==0){
- distinct_label_values = aggregate(target=labels, groups=labels, fn="count")
- if(max_num_distinct_values != 0){
- contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values)
- }
- maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values))
-}
-# store for contingency table cell values
-contingencyTablesCounts = matrix(0, rows=sz, cols=contingencyTableSz)
-# store for contingency table label(row) assignments
-contingencyTablesLabelValues = matrix(0, rows=sz, cols=contingencyTableSz)
-# store for contingency table feature(col) assignments
-contingencyTablesFeatureValues = matrix(0, rows=sz, cols=contingencyTableSz)
-# store for distinct values
-featureValues = matrix(0, rows=sz, cols=maxNumberOfGroups)
-# store for counts of distinct values
-featureCounts = matrix(0, rows=sz, cols=maxNumberOfGroups)
-# store for group means
-featureMeans = matrix(0, rows=sz, cols=maxNumberOfGroups)
-# store for group standard deviations
-featureSTDs = matrix(0, rows=sz, cols=maxNumberOfGroups)
-
-if(label_measurement_level == 0){
- featureCounts[label_index,1:nrow(distinct_label_values)] = t(distinct_label_values)
- parfor(i2 in 1:nrow(distinct_label_values)){
- featureValues[label_index,i2] = i2-labelCorrection
- }
-}
-
-parfor(i3 in 1:nrow(feature_indices), check=0){
- feature_index2 = castAsScalar(feature_indices[i3,1])
- feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1])
-
- feature = D[,feature_index2]
-
- if(feature_measurement_level == 0){
- featureCorrection = 1 - min(feature)
- feature = feature + featureCorrection
-
- if(label_measurement_level == feature_measurement_level){
- # categorical-categorical
- tests[feature_index2,1] = 1
- [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature)
- stats[feature_index2,1] = pVal
-
- sz3=1
- if(1==1){
- sz3 = nrow(contingencyTable)*ncol(contingencyTable)
- }
- contingencyTableLabelValues = matrix(0, rows=1, cols=sz3)
- contingencyTableFeatureValues = matrix(0, rows=1, cols=sz3)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Main starts here -----------------------------------------------------------------------------------------------------------
+
+# input data set
+D = read($1)
+
+# label attr id (must be a valid index > 0)
+label_index = $2
+
+# feature attributes, column vector of indices
+feature_indices = read($3)
+
+# can be either 1 (scale) or 0 (categorical)
+label_measurement_level = $4
+
+# measurement levels for features, 0/1 column vector
+feature_measurement_levels = read($5)
+
+sz = ncol(D)
+
+# store for pvalues and pearson's r
+stats = matrix(0, rows=sz, cols=1)
+# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
+tests = matrix(0, rows=sz, cols=1)
+# store for covariances used to compute pearson's r
+covariances = matrix(0, rows=sz, cols=1)
+# store for standard deviations used to compute pearson's r
+standard_deviations = matrix(0, rows=sz, cols=1)
+
+labels = D[,label_index]
+
+labelCorrection = 0
+if(label_measurement_level == 1){
+ numLabels = nrow(labels)
+ cmLabels = moment(labels,2)
+ stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) )
+ standard_deviations[label_index,1] = stdLabels
+}else{
+ labelCorrection = 1 - min(labels)
+ labels = labels + labelCorrection
+}
+
+mx = colMaxs(D)
+mn = colMins(D)
+num_distinct_values = mx-mn+1
+max_num_distinct_values = 0
+for(i1 in 1:nrow(feature_indices)){
+ feature_index1 = castAsScalar(feature_indices[i1,1])
+ num = castAsScalar(num_distinct_values[1,feature_index1])
+ if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values){
+ max_num_distinct_values = num
+ }
+}
+distinct_label_values = matrix(0, rows=1, cols=1)
+contingencyTableSz = 1
+maxNumberOfGroups = 1
+if(max_num_distinct_values != 0){
+ maxNumberOfGroups = max_num_distinct_values
+}
+if(label_measurement_level==0){
+ distinct_label_values = aggregate(target=labels, groups=labels, fn="count")
+ if(max_num_distinct_values != 0){
+ contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values)
+ }
+ maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values))
+}
+# store for contingency table cell values
+contingencyTablesCounts = matrix(0, rows=sz, cols=contingencyTableSz)
+# store for contingency table label(row) assignments
+contingencyTablesLabelValues = matrix(0, rows=sz, cols=contingencyTableSz)
+# store for contingency table feature(col) assignments
+contingencyTablesFeatureValues = matrix(0, rows=sz, cols=contingencyTableSz)
+# store for distinct values
+featureValues = matrix(0, rows=sz, cols=maxNumberOfGroups)
+# store for counts of distinct values
+featureCounts = matrix(0, rows=sz, cols=maxNumberOfGroups)
+# store for group means
+featureMeans = matrix(0, rows=sz, cols=maxNumberOfGroups)
+# store for group standard deviations
+featureSTDs = matrix(0, rows=sz, cols=maxNumberOfGroups)
+
+if(label_measurement_level == 0){
+ featureCounts[label_index,1:nrow(distinct_label_values)] = t(distinct_label_values)
+ parfor(i2 in 1:nrow(distinct_label_values)){
+ featureValues[label_index,i2] = i2-labelCorrection
+ }
+}
+
+parfor(i3 in 1:nrow(feature_indices), check=0){
+ feature_index2 = castAsScalar(feature_indices[i3,1])
+ feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1])
+
+ feature = D[,feature_index2]
+
+ if(feature_measurement_level == 0){
+ featureCorrection = 1 - min(feature)
+ feature = feature + featureCorrection
- parfor(i4 in 1:nrow(contingencyTable), check=0){
- parfor(j in 1:ncol(contingencyTable), check=0){
- contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
- contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
- }
- }
+ if(label_measurement_level == feature_measurement_level){
+ # categorical-categorical
+ tests[feature_index2,1] = 1
+ [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature)
+ stats[feature_index2,1] = pVal
+
+ sz3=1
+ if(1==1){
+ sz3 = nrow(contingencyTable)*ncol(contingencyTable)
+ }
+ contingencyTableLabelValues = matrix(0, rows=1, cols=sz3)
+ contingencyTableFeatureValues = matrix(0, rows=1, cols=sz3)
+
+ parfor(i4 in 1:nrow(contingencyTable), check=0){
+ parfor(j in 1:ncol(contingencyTable), check=0){
+ contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
+ contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
+ }
+ }
contingencyTableCounts = matrix(contingencyTable, rows=1, cols=sz3, byrow=TRUE)
- contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
-
- contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
- contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
-
- featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals
- parfor(i5 in 1:ncol(colMarginals), check=0){
- featureValues[feature_index2,i5] = i5-featureCorrection
- }
- }else{
- # label is scale, feature is categorical
- tests[feature_index2,1] = 2
- [pVal, frequencies, means, variances] = bivar_sc(labels, feature)
- stats[feature_index2,1] = pVal
- featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies)
- parfor(i6 in 1:nrow(frequencies), check=0){
- featureValues[feature_index2,i6] = i6 - featureCorrection
- }
- featureMeans[feature_index2,1:nrow(means)] = t(means)
- featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
- }
- }else{
- if(label_measurement_level == feature_measurement_level){
- # scale-scale
- tests[feature_index2,1] = 3
- [r, covariance, stdX, stdY] = bivar_ss(labels, feature)
- stats[feature_index2,1] = r
- covariances[feature_index2,1] = covariance
- standard_deviations[feature_index2,1] = stdY
- }else{
- # label is categorical, feature is scale
- tests[feature_index2,1] = 2
- [pVal, frequencies, means, variances] = bivar_sc(feature, labels)
- stats[feature_index2,1] = pVal
- featureMeans[feature_index2,1:nrow(means)] = t(means)
- featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
- }
- }
-}
-
-write(stats, $6, format="text")
-write(tests, $7, format="text")
-write(covariances, $8, format="text")
-write(standard_deviations, $9, format="text")
-write(contingencyTablesCounts, $10, format="text")
-write(contingencyTablesLabelValues, $11, format="text")
-write(contingencyTablesFeatureValues, $12, format="text")
-write(featureValues, $13, format="text")
-write(featureCounts, $14, format="text")
-write(featureMeans, $15, format="text")
-write(featureSTDs, $16, format="text")
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
-
- # Unweighted co-variance
- covXY = cov(X,Y)
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- W = nrow(X)
- m2X = moment(X,2)
- m2Y = moment(Y,2)
- sigmaX = sqrt(m2X * (W/(W-1.0)) )
- sigmaY = sqrt(m2Y * (W/(W-1.0)) )
-
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY)
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double pval, Matrix[Double] contingencyTable, Matrix[Double] rowMarginals, Matrix[Double] colMarginals) {
-
- # Contingency Table
- FF = table(A,B)
-
+ contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
+
+ contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
+ contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
+
+ featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals
+ parfor(i5 in 1:ncol(colMarginals), check=0){
+ featureValues[feature_index2,i5] = i5-featureCorrection
+ }
+ }else{
+ # label is scale, feature is categorical
+ tests[feature_index2,1] = 2
+ [pVal, frequencies, means, variances] = bivar_sc(labels, feature)
+ stats[feature_index2,1] = pVal
+ featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies)
+ parfor(i6 in 1:nrow(frequencies), check=0){
+ featureValues[feature_index2,i6] = i6 - featureCorrection
+ }
+ featureMeans[feature_index2,1:nrow(means)] = t(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
+ }
+ }else{
+ if(label_measurement_level == feature_measurement_level){
+ # scale-scale
+ tests[feature_index2,1] = 3
+ [r, covariance, stdX, stdY] = bivar_ss(labels, feature)
+ stats[feature_index2,1] = r
+ covariances[feature_index2,1] = covariance
+ standard_deviations[feature_index2,1] = stdY
+ }else{
+ # label is categorical, feature is scale
+ tests[feature_index2,1] = 2
+ [pVal, frequencies, means, variances] = bivar_sc(feature, labels)
+ stats[feature_index2,1] = pVal
+ featureMeans[feature_index2,1:nrow(means)] = t(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
+ }
+ }
+}
+
+write(stats, $6, format="text")
+write(tests, $7, format="text")
+write(covariances, $8, format="text")
+write(standard_deviations, $9, format="text")
+write(contingencyTablesCounts, $10, format="text")
+write(contingencyTablesLabelValues, $11, format="text")
+write(contingencyTablesFeatureValues, $12, format="text")
+write(featureValues, $13, format="text")
+write(featureCounts, $14, format="text")
+write(featureMeans, $15, format="text")
+write(featureSTDs, $16, format="text")
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {
+
+ # Unweighted co-variance
+ covXY = cov(X,Y)
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ W = nrow(X)
+ m2X = moment(X,2)
+ m2Y = moment(Y,2)
+ sigmaX = sqrt(m2X * (W/(W-1.0)) )
+ sigmaY = sqrt(m2Y * (W/(W-1.0)) )
+
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY)
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double pval, Matrix[Double] contingencyTable, Matrix[Double] rowMarginals, Matrix[Double] colMarginals) {
+
+ # Contingency Table
+ FF = table(A,B)
+
tmp = removeEmpty(target=FF, margin="rows");
F = removeEmpty(target=tmp, margin="cols");
- # Chi-Squared
- W = sum(F)
- r = rowSums(F)
- c = colSums(F)
- E = (r %*% c)/W
- E = ppred(E, 0, "==")*0.0001 + E
- T = (F-E)^2/E
- chi_squared = sum(T)
-
- # compute p-value
- degFreedom = (nrow(F)-1)*(ncol(F)-1)
- pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE)
-
-
- # Assign return values
- pval = pValue
- contingencyTable = F
- rowMarginals = r
- colMarginals = c
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double pVal, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
- # mean and variance in target variable
- W = nrow(A)
- my = mean(Y)
- varY = moment(Y,2) * W/(W-1.0)
-
- # category-wise (frequencies, means, variances)
- CFreqs1 = aggregate(target=Y, groups=A, fn="count")
+ # Chi-Squared
+ W = sum(F)
+ r = rowSums(F)
+ c = colSums(F)
+ E = (r %*% c)/W
+ E = ppred(E, 0, "==")*0.0001 + E
+ T = (F-E)^2/E
+ chi_squared = sum(T)
+
+ # compute p-value
+ degFreedom = (nrow(F)-1)*(ncol(F)-1)
+ pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE)
+
+
+ # Assign return values
+ pval = pValue
+ contingencyTable = F
+ rowMarginals = r
+ colMarginals = c
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double pVal, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
+ # mean and variance in target variable
+ W = nrow(A)
+ my = mean(Y)
+ varY = moment(Y,2) * W/(W-1.0)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs1 = aggregate(target=Y, groups=A, fn="count")
present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), margin="rows")
CFreqs = present_domain_vals_mat %*% CFreqs1
- CMeans = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="mean")
- CVars = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="variance")
-
- # number of categories
- R = nrow(CFreqs)
- df1 = R-1
- df2 = W-R
-
- anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1)
- anova_den = sum( (CFreqs-1)*CVars )/(W-R)
- AnovaF = anova_num/anova_den
- pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=FALSE)
-}
+ CMeans = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="mean")
+ CVars = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="variance")
+
+ # number of categories
+ R = nrow(CFreqs)
+ df1 = R-1
+ df2 = W-R
+
+ anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1)
+ anova_den = sum( (CFreqs-1)*CVars )/(W-R)
+ AnovaF = anova_num/anova_den
+ pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=FALSE)
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/mdabivar/MDABivariateStats.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/mdabivar/MDABivariateStats.pydml b/src/test/scripts/applications/mdabivar/MDABivariateStats.pydml
index c899065..7fbc101 100644
--- a/src/test/scripts/applications/mdabivar/MDABivariateStats.pydml
+++ b/src/test/scripts/applications/mdabivar/MDABivariateStats.pydml
@@ -1,246 +1,246 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Main starts here -----------------------------------------------------------------------------------------------------------
-
-# input data set
-D = load($1)
-
-# label attr id (must be a valid index > 0)
-label_index = $2
-
-# feature attributes, column vector of indices
-feature_indices = load($3)
-
-# can be either 1 (scale) or 0 (categorical)
-label_measurement_level = $4
-
-# measurement levels for features, 0/1 column vector
-feature_measurement_levels = read($5)
-
-sz = ncol(D)
-
-# store for pvalues and pearson's r
-stats = full(0, rows=sz, cols=1)
-# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
-tests = full(0, rows=sz, cols=1)
-# store for covariances used to compute pearson's r
-covariances = full(0, rows=sz, cols=1)
-# store for standard deviations used to compute pearson's r
-standard_deviations = full(0, rows=sz, cols=1)
-
-labels = D[,label_index]
-
-labelCorrection = 0
-if(label_measurement_level == 1):
- numLabels = nrow(labels)
- cmLabels = moment(labels,2)
- stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) )
- standard_deviations[label_index,1] = stdLabels
-else:
- labelCorrection = 1 - min(labels)
- labels = labels + labelCorrection
-
-mx = colMaxs(D)
-mn = colMins(D)
-num_distinct_values = mx-mn+1
-max_num_distinct_values = 0
-for(i1 in 1:nrow(feature_indices)):
- feature_index1 = castAsScalar(feature_indices[i1,1])
- num = castAsScalar(num_distinct_values[1,feature_index1])
- if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values):
- max_num_distinct_values = num
-distinct_label_values = full(0, rows=1, cols=1)
-contingencyTableSz = 1
-maxNumberOfGroups = 1
-if(max_num_distinct_values != 0):
- maxNumberOfGroups = max_num_distinct_values
-if(label_measurement_level==0):
- distinct_label_values = aggregate(target=labels, groups=labels, fn="count")
- if(max_num_distinct_values != 0):
- contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values)
- maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values))
-# store for contingency table cell values
-contingencyTablesCounts = full(0, rows=sz, cols=contingencyTableSz)
-# store for contingency table label(row) assignments
-contingencyTablesLabelValues = full(0, rows=sz, cols=contingencyTableSz)
-# store for contingency table feature(col) assignments
-contingencyTablesFeatureValues = full(0, rows=sz, cols=contingencyTableSz)
-# store for distinct values
-featureValues = full(0, rows=sz, cols=maxNumberOfGroups)
-# store for counts of distinct values
-featureCounts = full(0, rows=sz, cols=maxNumberOfGroups)
-# store for group means
-featureMeans = full(0, rows=sz, cols=maxNumberOfGroups)
-# store for group standard deviations
-featureSTDs = full(0, rows=sz, cols=maxNumberOfGroups)
-
-if(label_measurement_level == 0):
- featureCounts[label_index,1:nrow(distinct_label_values)] = transpose(distinct_label_values)
- parfor(i2 in 1:nrow(distinct_label_values)):
- featureValues[label_index,i2] = i2-labelCorrection
-
-parfor(i3 in 1:nrow(feature_indices), check=0):
- feature_index2 = castAsScalar(feature_indices[i3,1])
- feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1])
-
- feature = D[,feature_index2]
-
- if(feature_measurement_level == 0):
- featureCorrection = 1 - min(feature)
- feature = feature + featureCorrection
-
- if(label_measurement_level == feature_measurement_level):
- # categorical-categorical
- tests[feature_index2,1] = 1
- [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature)
- stats[feature_index2,1] = pVal
-
- sz3=1
- if(1==1):
- sz3 = nrow(contingencyTable)*ncol(contingencyTable)
- contingencyTableLabelValues = full(0, rows=1, cols=sz3)
- contingencyTableFeatureValues = full(0, rows=1, cols=sz3)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Main starts here -----------------------------------------------------------------------------------------------------------
+
+# input data set
+D = load($1)
+
+# label attr id (must be a valid index > 0)
+label_index = $2
+
+# feature attributes, column vector of indices
+feature_indices = load($3)
+
+# can be either 1 (scale) or 0 (categorical)
+label_measurement_level = $4
+
+# measurement levels for features, 0/1 column vector
+feature_measurement_levels = read($5)
+
+sz = ncol(D)
+
+# store for pvalues and pearson's r
+stats = full(0, rows=sz, cols=1)
+# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
+tests = full(0, rows=sz, cols=1)
+# store for covariances used to compute pearson's r
+covariances = full(0, rows=sz, cols=1)
+# store for standard deviations used to compute pearson's r
+standard_deviations = full(0, rows=sz, cols=1)
+
+labels = D[,label_index]
+
+labelCorrection = 0
+if(label_measurement_level == 1):
+ numLabels = nrow(labels)
+ cmLabels = moment(labels,2)
+ stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) )
+ standard_deviations[label_index,1] = stdLabels
+else:
+ labelCorrection = 1 - min(labels)
+ labels = labels + labelCorrection
+
+mx = colMaxs(D)
+mn = colMins(D)
+num_distinct_values = mx-mn+1
+max_num_distinct_values = 0
+for(i1 in 1:nrow(feature_indices)):
+ feature_index1 = castAsScalar(feature_indices[i1,1])
+ num = castAsScalar(num_distinct_values[1,feature_index1])
+ if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values):
+ max_num_distinct_values = num
+distinct_label_values = full(0, rows=1, cols=1)
+contingencyTableSz = 1
+maxNumberOfGroups = 1
+if(max_num_distinct_values != 0):
+ maxNumberOfGroups = max_num_distinct_values
+if(label_measurement_level==0):
+ distinct_label_values = aggregate(target=labels, groups=labels, fn="count")
+ if(max_num_distinct_values != 0):
+ contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values)
+ maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values))
+# store for contingency table cell values
+contingencyTablesCounts = full(0, rows=sz, cols=contingencyTableSz)
+# store for contingency table label(row) assignments
+contingencyTablesLabelValues = full(0, rows=sz, cols=contingencyTableSz)
+# store for contingency table feature(col) assignments
+contingencyTablesFeatureValues = full(0, rows=sz, cols=contingencyTableSz)
+# store for distinct values
+featureValues = full(0, rows=sz, cols=maxNumberOfGroups)
+# store for counts of distinct values
+featureCounts = full(0, rows=sz, cols=maxNumberOfGroups)
+# store for group means
+featureMeans = full(0, rows=sz, cols=maxNumberOfGroups)
+# store for group standard deviations
+featureSTDs = full(0, rows=sz, cols=maxNumberOfGroups)
+
+if(label_measurement_level == 0):
+ featureCounts[label_index,1:nrow(distinct_label_values)] = transpose(distinct_label_values)
+ parfor(i2 in 1:nrow(distinct_label_values)):
+ featureValues[label_index,i2] = i2-labelCorrection
+
+parfor(i3 in 1:nrow(feature_indices), check=0):
+ feature_index2 = castAsScalar(feature_indices[i3,1])
+ feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1])
+
+ feature = D[,feature_index2]
+
+ if(feature_measurement_level == 0):
+ featureCorrection = 1 - min(feature)
+ feature = feature + featureCorrection
+
+ if(label_measurement_level == feature_measurement_level):
+ # categorical-categorical
+ tests[feature_index2,1] = 1
+ [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature)
+ stats[feature_index2,1] = pVal
+
+ sz3=1
+ if(1==1):
+ sz3 = nrow(contingencyTable)*ncol(contingencyTable)
+ contingencyTableLabelValues = full(0, rows=1, cols=sz3)
+ contingencyTableFeatureValues = full(0, rows=1, cols=sz3)
- parfor(i4 in 1:nrow(contingencyTable), check=0):
- parfor(j in 1:ncol(contingencyTable), check=0):
- contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
- contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
+ parfor(i4 in 1:nrow(contingencyTable), check=0):
+ parfor(j in 1:ncol(contingencyTable), check=0):
+ contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
+ contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
contingencyTableCounts = contingencyTable.reshape(rows=1, cols=sz3)
- contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
-
- contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
- contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
-
- featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals
- parfor(i5 in 1:ncol(colMarginals), check=0):
- featureValues[feature_index2,i5] = i5-featureCorrection
- else:
- # label is scale, feature is categorical
- tests[feature_index2,1] = 2
- [pVal, frequencies, means, variances] = bivar_sc(labels, feature)
- stats[feature_index2,1] = pVal
- featureCounts[feature_index2,1:nrow(frequencies)] = transpose(frequencies)
- parfor(i6 in 1:nrow(frequencies), check=0):
- featureValues[feature_index2,i6] = i6 - featureCorrection
- featureMeans[feature_index2,1:nrow(means)] = transpose(means)
- featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances))
- else:
- if(label_measurement_level == feature_measurement_level):
- # scale-scale
- tests[feature_index2,1] = 3
- [r, covariance, stdX, stdY] = bivar_ss(labels, feature)
- stats[feature_index2,1] = r
- covariances[feature_index2,1] = covariance
- standard_deviations[feature_index2,1] = stdY
- else:
- # label is categorical, feature is scale
- tests[feature_index2,1] = 2
- [pVal, frequencies, means, variances] = bivar_sc(feature, labels)
- stats[feature_index2,1] = pVal
- featureMeans[feature_index2,1:nrow(means)] = transpose(means)
- featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances))
- # end if(feature_measurement_level == 0)
-# end parfor(i3 in 1:nrow(feature_indices), check=0)
-
-save(stats, $6, format="text")
-save(tests, $7, format="text")
-save(covariances, $8, format="text")
-save(standard_deviations, $9, format="text")
-save(contingencyTablesCounts, $10, format="text")
-save(contingencyTablesLabelValues, $11, format="text")
-save(contingencyTablesFeatureValues, $12, format="text")
-save(featureValues, $13, format="text")
-save(featureCounts, $14, format="text")
-save(featureMeans, $15, format="text")
-save(featureSTDs, $16, format="text")
-
-# -----------------------------------------------------------------------------------------------------------
-
-def bivar_ss(X:matrix[float], Y:matrix[float]) -> (R:float, covXY:float, sigmaX:float, sigmaY:float):
- # Unweighted co-variance
- covXY = cov(X,Y)
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- W = nrow(X)
- m2X = moment(X,2)
- m2Y = moment(Y,2)
- sigmaX = sqrt(m2X * (W/(W-1.0)) )
- sigmaY = sqrt(m2Y * (W/(W-1.0)) )
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY)
-
-# -----------------------------------------------------------------------------------------------------------
-
-def bivar_cc(A:matrix[float], B:matrix[float]) -> (pval:float, contingencyTable:matrix[float], rowMarginals:matrix[float], colMarginals:matrix[float]):
- # Contingency Table
- FF = table(A,B)
-
- tmp = removeEmpty(target=FF, axis=0)
- F = removeEmpty(target=tmp, axis=1)
-
- # Chi-Squared
- W = sum(F)
- r = rowSums(F)
- c = colSums(F)
- E = (dot(r, c))/W
- E = ppred(E, 0, "==")*0.0001 + E
- T = (F-E)**2/E
- chi_squared = sum(T)
- # compute p-value
- degFreedom = (nrow(F)-1)*(ncol(F)-1)
- pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=False)
-
- # Assign return values
- pval = pValue
- contingencyTable = F
- rowMarginals = r
- colMarginals = c
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-def bivar_sc(Y:matrix[float], A:matrix[float]) -> (pVal:float, CFreqs:matrix[float], CMeans:matrix[float], CVars:matrix[float]):
- # mean and variance in target variable
- W = nrow(A)
- my = mean(Y)
- varY = moment(Y,2) * W/(W-1.0)
-
- # category-wise (frequencies, means, variances)
- CFreqs1 = aggregate(target=Y, groups=A, fn="count")
- present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), axis=0)
- CFreqs = dot(present_domain_vals_mat, CFreqs1)
-
- CMeans = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="mean"))
- CVars = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="variance"))
-
- # number of categories
- R = nrow(CFreqs)
- df1 = R-1
- df2 = W-R
-
- anova_num = sum( (CFreqs*(CMeans-my)**2) )/(R-1)
- anova_den = sum( (CFreqs-1)*CVars )/(W-R)
- AnovaF = anova_num/anova_den
- pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=False)
-
+ contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts
+
+ contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
+ contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues
+
+ featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals
+ parfor(i5 in 1:ncol(colMarginals), check=0):
+ featureValues[feature_index2,i5] = i5-featureCorrection
+ else:
+ # label is scale, feature is categorical
+ tests[feature_index2,1] = 2
+ [pVal, frequencies, means, variances] = bivar_sc(labels, feature)
+ stats[feature_index2,1] = pVal
+ featureCounts[feature_index2,1:nrow(frequencies)] = transpose(frequencies)
+ parfor(i6 in 1:nrow(frequencies), check=0):
+ featureValues[feature_index2,i6] = i6 - featureCorrection
+ featureMeans[feature_index2,1:nrow(means)] = transpose(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances))
+ else:
+ if(label_measurement_level == feature_measurement_level):
+ # scale-scale
+ tests[feature_index2,1] = 3
+ [r, covariance, stdX, stdY] = bivar_ss(labels, feature)
+ stats[feature_index2,1] = r
+ covariances[feature_index2,1] = covariance
+ standard_deviations[feature_index2,1] = stdY
+ else:
+ # label is categorical, feature is scale
+ tests[feature_index2,1] = 2
+ [pVal, frequencies, means, variances] = bivar_sc(feature, labels)
+ stats[feature_index2,1] = pVal
+ featureMeans[feature_index2,1:nrow(means)] = transpose(means)
+ featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances))
+ # end if(feature_measurement_level == 0)
+# end parfor(i3 in 1:nrow(feature_indices), check=0)
+
+save(stats, $6, format="text")
+save(tests, $7, format="text")
+save(covariances, $8, format="text")
+save(standard_deviations, $9, format="text")
+save(contingencyTablesCounts, $10, format="text")
+save(contingencyTablesLabelValues, $11, format="text")
+save(contingencyTablesFeatureValues, $12, format="text")
+save(featureValues, $13, format="text")
+save(featureCounts, $14, format="text")
+save(featureMeans, $15, format="text")
+save(featureSTDs, $16, format="text")
+
+# -----------------------------------------------------------------------------------------------------------
+
+def bivar_ss(X:matrix[float], Y:matrix[float]) -> (R:float, covXY:float, sigmaX:float, sigmaY:float):
+ # Unweighted co-variance
+ covXY = cov(X,Y)
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ W = nrow(X)
+ m2X = moment(X,2)
+ m2Y = moment(Y,2)
+ sigmaX = sqrt(m2X * (W/(W-1.0)) )
+ sigmaY = sqrt(m2Y * (W/(W-1.0)) )
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY)
+
+# -----------------------------------------------------------------------------------------------------------
+
+def bivar_cc(A:matrix[float], B:matrix[float]) -> (pval:float, contingencyTable:matrix[float], rowMarginals:matrix[float], colMarginals:matrix[float]):
+ # Contingency Table
+ FF = table(A,B)
+
+ tmp = removeEmpty(target=FF, axis=0)
+ F = removeEmpty(target=tmp, axis=1)
+
+ # Chi-Squared
+ W = sum(F)
+ r = rowSums(F)
+ c = colSums(F)
+ E = (dot(r, c))/W
+ E = ppred(E, 0, "==")*0.0001 + E
+ T = (F-E)**2/E
+ chi_squared = sum(T)
+ # compute p-value
+ degFreedom = (nrow(F)-1)*(ncol(F)-1)
+ pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=False)
+
+ # Assign return values
+ pval = pValue
+ contingencyTable = F
+ rowMarginals = r
+ colMarginals = c
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+def bivar_sc(Y:matrix[float], A:matrix[float]) -> (pVal:float, CFreqs:matrix[float], CMeans:matrix[float], CVars:matrix[float]):
+ # mean and variance in target variable
+ W = nrow(A)
+ my = mean(Y)
+ varY = moment(Y,2) * W/(W-1.0)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs1 = aggregate(target=Y, groups=A, fn="count")
+ present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), axis=0)
+ CFreqs = dot(present_domain_vals_mat, CFreqs1)
+
+ CMeans = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="mean"))
+ CVars = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="variance"))
+
+ # number of categories
+ R = nrow(CFreqs)
+ df1 = R-1
+ df2 = W-R
+
+ anova_num = sum( (CFreqs*(CMeans-my)**2) )/(R-1)
+ anova_den = sum( (CFreqs-1)*CVars )/(W-R)
+ AnovaF = anova_num/anova_den
+ pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=False)
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
index dc65b8a..a3ca47a 100644
--- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
@@ -1,71 +1,71 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-args <- commandArgs(TRUE)
-
-library("Matrix")
-
-D = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
-C = as.matrix(readMM(paste(args[1], "Y.mtx", sep="")))
-
-# reading input args
-numClasses = as.integer(args[2]);
-laplace_correction = as.double(args[3]);
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-
-# Compute the feature counts for each class
-classFeatureCounts = matrix(0, numClasses, numFeatures)
-for (i in 1:numFeatures) {
- Col = D[,i]
- classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2];
-}
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
-
-# Compute class conditional probabilities
-ones = matrix(1, 1, numFeatures)
-repClassSums = classSums %*% ones;
-class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums;
-
-# Compute class priors
-class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2]
-class_prior = class_counts / numRows;
-
-# Compute accuracy on training set
-ones = matrix(1, numRows, 1)
-D_w_ones = cbind(D, ones)
-model = cbind(class_conditionals, class_prior)
-log_probs = D_w_ones %*% t(log(model))
-pred = max.col(log_probs,ties.method="last");
-acc = sum(pred == C) / numRows * 100
-
-print(paste("Training Accuracy (%): ", acc, sep=""))
-
-# write out the model
-writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep=""));
-writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+
+library("Matrix")
+
+D = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+C = as.matrix(readMM(paste(args[1], "Y.mtx", sep="")))
+
+# reading input args
+numClasses = as.integer(args[2]);
+laplace_correction = as.double(args[3]);
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, numClasses, numFeatures)
+for (i in 1:numFeatures) {
+ Col = D[,i]
+ classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2];
+}
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = matrix(1, 1, numFeatures)
+repClassSums = classSums %*% ones;
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums;
+
+# Compute class priors
+class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2]
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, numRows, 1)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = max.col(log_probs,ties.method="last");
+acc = sum(pred == C) / numRows * 100
+
+print(paste("Training Accuracy (%): ", acc, sep=""))
+
+# write out the model
+writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep=""));
+writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));