You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/26 02:12:43 UTC
[19/55] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
index d0ce8d1..77df03b 100644
--- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
@@ -1,78 +1,78 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multinomial naive Bayes classifier with Laplace correction
-#
-# Example Usage:
-# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
-#
-
-# defaults
-# $laplace = 1
-fmt = ifdef($fmt, "text")
-
-# reading input args
-numClasses = $classes
-D = read($X)
-C = read($Y)
-laplace_correction = ifdef($laplace, 1)
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-
-# Compute the feature counts for each class
-classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
-parfor (i in 1:numFeatures) {
- Col = D[,i]
- classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
-}
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
-
-# Compute class conditional probabilities
-ones = matrix(1, rows=1, cols=numFeatures)
-repClassSums = classSums %*% ones
-class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
-
-# Compute class priors
-class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
-class_prior = class_counts / numRows;
-
-# Compute accuracy on training set
-ones = matrix(1, rows=numRows, cols=1)
-D_w_ones = append(D, ones)
-model = append(class_conditionals, class_prior)
-log_probs = D_w_ones %*% t(log(model))
-pred = rowIndexMax(log_probs)
-acc = sum(ppred(pred, C, "==")) / numRows * 100
-
-acc_str = "Training Accuracy (%): " + acc
-print(acc_str)
-write(acc_str, $accuracy)
-
-# write out the model
-write(class_prior, $prior, format=fmt);
-write(class_conditionals, $conditionals, format=fmt);
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = read($X)
+C = read($Y)
+laplace_correction = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
+parfor (i in 1:numFeatures) {
+ Col = D[,i]
+ classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
+}
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = matrix(1, rows=1, cols=numFeatures)
+repClassSums = classSums %*% ones
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
+
+# Compute class priors
+class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, rows=numRows, cols=1)
+D_w_ones = append(D, ones)
+model = append(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = rowIndexMax(log_probs)
+acc = sum(ppred(pred, C, "==")) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+write(acc_str, $accuracy)
+
+# write out the model
+write(class_prior, $prior, format=fmt);
+write(class_conditionals, $conditionals, format=fmt);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
index 5d84951..462b330 100644
--- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
@@ -1,79 +1,79 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multinomial naive Bayes classifier with Laplace correction
-#
-# Example Usage:
-# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
-#
-
-# defaults
-# $laplace = 1
-fmt = ifdef($fmt, "text")
-
-# reading input args
-numClasses = $classes
-D = load($X)
-C = load($Y)
-laplace_correction = ifdef($laplace, 1)
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-
-# Compute the feature counts for each class
-classFeatureCounts = full(0, rows=numClasses, cols=numFeatures)
-parfor (i in 1:numFeatures):
- Col = D[,i]
- classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses)
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
-
-# Compute class conditional probabilities
-ones = full(1, rows=1, cols=numFeatures)
-repClassSums = dot(classSums, ones)
-class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
-
-# Compute class priors
-class_counts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses)
-class_prior = class_counts / numRows
-
-# Compute accuracy on training set
-ones = full(1, rows=numRows, cols=1)
-D_w_ones = append(D, ones)
-model = append(class_conditionals, class_prior)
-log_model = log(model)
-transpose_log_model = log_model.transpose()
-log_probs = dot(D_w_ones, transpose_log_model)
-pred = rowIndexMax(log_probs)
-acc = sum(ppred(pred, C, "==")) / numRows * 100
-
-acc_str = "Training Accuracy (%): " + acc
-print(acc_str)
-save(acc_str, $accuracy)
-
-# write out the model
-save(class_prior, $prior, format=fmt)
-save(class_conditionals, $conditionals, format=fmt)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = load($X)
+C = load($Y)
+laplace_correction = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = full(0, rows=numClasses, cols=numFeatures)
+parfor (i in 1:numFeatures):
+ Col = D[,i]
+ classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses)
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = full(1, rows=1, cols=numFeatures)
+repClassSums = dot(classSums, ones)
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
+
+# Compute class priors
+class_counts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses)
+class_prior = class_counts / numRows
+
+# Compute accuracy on training set
+ones = full(1, rows=numRows, cols=1)
+D_w_ones = append(D, ones)
+model = append(class_conditionals, class_prior)
+log_model = log(model)
+transpose_log_model = log_model.transpose()
+log_probs = dot(D_w_ones, transpose_log_model)
+pred = rowIndexMax(log_probs)
+acc = sum(ppred(pred, C, "==")) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+save(acc_str, $accuracy)
+
+# write out the model
+save(class_prior, $prior, format=fmt)
+save(class_conditionals, $conditionals, format=fmt)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/naive-bayes/naive-bayes.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.R b/src/test/scripts/applications/naive-bayes/naive-bayes.R
index dc65b8a..a3ca47a 100644
--- a/src/test/scripts/applications/naive-bayes/naive-bayes.R
+++ b/src/test/scripts/applications/naive-bayes/naive-bayes.R
@@ -1,71 +1,71 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-args <- commandArgs(TRUE)
-
-library("Matrix")
-
-D = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
-C = as.matrix(readMM(paste(args[1], "Y.mtx", sep="")))
-
-# reading input args
-numClasses = as.integer(args[2]);
-laplace_correction = as.double(args[3]);
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-
-# Compute the feature counts for each class
-classFeatureCounts = matrix(0, numClasses, numFeatures)
-for (i in 1:numFeatures) {
- Col = D[,i]
- classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2];
-}
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
-
-# Compute class conditional probabilities
-ones = matrix(1, 1, numFeatures)
-repClassSums = classSums %*% ones;
-class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums;
-
-# Compute class priors
-class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2]
-class_prior = class_counts / numRows;
-
-# Compute accuracy on training set
-ones = matrix(1, numRows, 1)
-D_w_ones = cbind(D, ones)
-model = cbind(class_conditionals, class_prior)
-log_probs = D_w_ones %*% t(log(model))
-pred = max.col(log_probs,ties.method="last");
-acc = sum(pred == C) / numRows * 100
-
-print(paste("Training Accuracy (%): ", acc, sep=""))
-
-# write out the model
-writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep=""));
-writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+
+library("Matrix")
+
+D = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+C = as.matrix(readMM(paste(args[1], "Y.mtx", sep="")))
+
+# reading input args
+numClasses = as.integer(args[2]);
+laplace_correction = as.double(args[3]);
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, numClasses, numFeatures)
+for (i in 1:numFeatures) {
+ Col = D[,i]
+ classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2];
+}
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = matrix(1, 1, numFeatures)
+repClassSums = classSums %*% ones;
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums;
+
+# Compute class priors
+class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2]
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, numRows, 1)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = max.col(log_probs,ties.method="last");
+acc = sum(pred == C) / numRows * 100
+
+print(paste("Training Accuracy (%): ", acc, sep=""))
+
+# write out the model
+writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep=""));
+writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/naive-bayes/naive-bayes.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.dml b/src/test/scripts/applications/naive-bayes/naive-bayes.dml
index 1fe1bf4..a81edea 100644
--- a/src/test/scripts/applications/naive-bayes/naive-bayes.dml
+++ b/src/test/scripts/applications/naive-bayes/naive-bayes.dml
@@ -1,67 +1,67 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multinomial naive Bayes classifier with Laplace correction
-#
-# Example Usage:
-# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
-#
-
-# defaults
-# $laplace = 1
-fmt = ifdef($fmt, "text")
-
-# reading input args
-numClasses = $classes
-D = read($X)
-C = read($Y)
-laplaceCorrection = ifdef($laplace, 1)
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-# Compute the feature counts for each class
-classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses));
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection
-
-# Compute class conditional probabilities
-classConditionals = (classFeatureCounts + laplaceCorrection) / classSums
-
-# Compute class priors
-classCounts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
-classPrior = classCounts / numRows;
-
-# Compute accuracy on training set
-logProbs = D %*% t(log(classConditionals)) + t(log(classPrior));
-acc = sum(rowIndexMax(logProbs) == C) / numRows * 100
-
-acc_str = "Training Accuracy (%): " + acc
-print(acc_str)
-write(acc_str, $accuracy)
-
-# write out the model
-write(classPrior, $prior, format=fmt);
-write(classConditionals, $conditionals, format=fmt);
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = read($X)
+C = read($Y)
+laplaceCorrection = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+# Compute the feature counts for each class
+classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses));
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection
+
+# Compute class conditional probabilities
+classConditionals = (classFeatureCounts + laplaceCorrection) / classSums
+
+# Compute class priors
+classCounts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
+classPrior = classCounts / numRows;
+
+# Compute accuracy on training set
+logProbs = D %*% t(log(classConditionals)) + t(log(classPrior));
+acc = sum(rowIndexMax(logProbs) == C) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+write(acc_str, $accuracy)
+
+# write out the model
+write(classPrior, $prior, format=fmt);
+write(classConditionals, $conditionals, format=fmt);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
index 25fbf84..480265d 100644
--- a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
+++ b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
@@ -1,71 +1,71 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multinomial naive Bayes classifier with Laplace correction
-#
-# Example Usage:
-# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
-#
-
-# defaults
-# $laplace = 1
-fmt = ifdef($fmt, "text")
-
-# reading input args
-numClasses = $classes
-D = load($X)
-C = load($Y)
-laplaceCorrection = ifdef($laplace, 1)
-
-numRows = nrow(D)
-numFeatures = ncol(D)
-
-# Compute conditionals
-# Compute the feature counts for each class
-classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=numClasses);
-
-# Compute the total feature count for each class
-# and add the number of features to this sum
-# for subsequent regularization (Laplace's rule)
-classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection
-
-# Compute class conditional probabilities
-classConditionals = (classFeatureCounts + laplaceCorrection) / classSums
-
-# Compute class priors
-classCounts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses)
-classPrior = classCounts / numRows
-
-# Compute accuracy on training set
-lmodel1 = log(classConditionals)
-lmodel2 = log(classPrior)
-tlmodel1 = lmodel1.transpose()
-tlmodel2 = lmodel2.transpose()
-logProbs = dot(D, tlmodel1) + tlmodel2
-acc = sum(rowIndexMax(logProbs) == C) / numRows * 100
-
-acc_str = "Training Accuracy (%): " + acc
-print(acc_str)
-save(acc_str, $accuracy)
-
-# write out the model
-save(classPrior, $prior, format=fmt)
-save(classConditionals, $conditionals, format=fmt)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = load($X)
+C = load($Y)
+laplaceCorrection = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+# Compute the feature counts for each class
+classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=numClasses);
+
+# Compute the total feature count for each class
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection
+
+# Compute class conditional probabilities
+classConditionals = (classFeatureCounts + laplaceCorrection) / classSums
+
+# Compute class priors
+classCounts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses)
+classPrior = classCounts / numRows
+
+# Compute accuracy on training set
+lmodel1 = log(classConditionals)
+lmodel2 = log(classPrior)
+tlmodel1 = lmodel1.transpose()
+tlmodel2 = lmodel2.transpose()
+logProbs = dot(D, tlmodel1) + tlmodel2
+acc = sum(rowIndexMax(logProbs) == C) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+save(acc_str, $accuracy)
+
+# write out the model
+save(classPrior, $prior, format=fmt)
+save(classConditionals, $conditionals, format=fmt)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/parfor/parfor_bivariate.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/parfor/parfor_bivariate.R b/src/test/scripts/applications/parfor/parfor_bivariate.R
index 889b22b..82a5be5 100644
--- a/src/test/scripts/applications/parfor/parfor_bivariate.R
+++ b/src/test/scripts/applications/parfor/parfor_bivariate.R
@@ -1,155 +1,155 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-
-D1 <- readMM(paste(args[1], "D.mtx", sep=""))
-S11 <- readMM(paste(args[1], "S1.mtx", sep=""))
-S21 <- readMM(paste(args[1], "S2.mtx", sep=""))
-K11 <- readMM(paste(args[1], "K1.mtx", sep=""))
-K21 <- readMM(paste(args[1], "K2.mtx", sep=""))
-D <- as.matrix(D1);
-S1 <- as.matrix(S11);
-S2 <- as.matrix(S21);
-K1 <- as.matrix(K11);
-K2 <- as.matrix(K21);
-
-numPairs <- ncol(S1) * ncol(S2); # number of attribute pairs (|S1|*|S2|)
-maxC <- args[2]; # max number of categories in any categorical attribute
-
-s1size <- ncol(S1);
-s2size <- ncol(S2);
-
-# R, chisq, cramers, spearman, eta, anovaf
-numstats <- 8;
-basestats <- array(0,dim=c(numstats,numPairs));
-cat_counts <- array(0,dim=c(maxC,numPairs));
-cat_means <- array(0,dim=c(maxC,numPairs));
-cat_vars <- array(0,dim=c(maxC,numPairs));
-
-
-for( i in 1:s1size ) {
- a1 <- S1[,i];
- k1 <- K1[1,i];
- A1 <- as.matrix(D[,a1]);
-
- for( j in 1:s2size ) {
- pairID <-(i-1)*s2size+j;
- a2 <- S2[,j];
- k2 <- K2[1,j];
- A2 <- as.matrix(D[,a2]);
-
- if (k1 == k2) {
- if (k1 == 1) {
- # scale-scale
- print("scale-scale");
- basestats[1,pairID] <- cor(D[,a1], D[,a2]);
- #basestats[1,pairID] <- cor(A1, A2);
-
- print(basestats[1,pairID]);
- } else {
- # nominal-nominal or ordinal-ordinal
- print("categorical-categorical");
- F <- table(A1,A2);
- cst <- chisq.test(F);
- chi_squared <- as.numeric(cst[1]);
- degFreedom <- (nrow(F)-1)*(ncol(F)-1);
- pValue <- as.numeric(cst[3]);
- q <- min(dim(F));
- W <- sum(F);
- cramers_v <- sqrt(chi_squared/(W*(q-1)));
-
- basestats[2,pairID] <- chi_squared;
- basestats[3,pairID] <- degFreedom;
- basestats[4,pairID] <- pValue;
- basestats[5,pairID] <- cramers_v;
-
- if ( k1 == 3 ) {
- # ordinal-ordinal
- print("ordinal-ordinal");
- basestats[6,pairID] <- cor(A1,A2, method="spearman");
- }
- }
- }
- else {
- if (k1 == 1 || k2 == 1) {
- # Scale-nominal/ordinal
- print("scale-categorical");
- if ( k1 == 1 ) {
- Av <- as.matrix(A2);
- Yv <- as.matrix(A1);
- }
- else {
- Av <- as.matrix(A1);
- Yv <- as.matrix(A2);
- }
-
- W <- nrow(Av);
- my <- mean(Yv);
- varY <- var(Yv);
-
- CFreqs <- as.matrix(table(Av));
- CMeans <- as.matrix(aggregate(Yv, by=list(Av), "mean")$V1);
- CVars <- as.matrix(aggregate(Yv, by=list(Av), "var")$V1);
- R <- nrow(CFreqs);
-
- Eta <- sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
- anova_num <- sum( (CFreqs*(CMeans-my)^2) )/(R-1);
- anova_den <- sum( (CFreqs-1)*CVars )/(W-R);
- ANOVAF <- anova_num/anova_den;
-
- basestats[7,pairID] <- Eta;
- basestats[8,pairID] <- ANOVAF;
-
- cat_counts[ 1:length(CFreqs),pairID] <- CFreqs;
- cat_means[ 1:length(CMeans),pairID] <- CMeans;
- cat_vars[ 1:length(CVars),pairID] <- CVars;
- }
- else {
- # nominal-ordinal or ordinal-nominal
- print("nomial-ordinal"); #TODO should not be same code
- F <- table(A1,A2);
- cst <- chisq.test(F);
- chi_squared <- as.numeric(cst[1]);
- degFreedom <- (nrow(F)-1)*(ncol(F)-1);
- pValue <- as.numeric(cst[3]);
- q <- min(dim(F));
- W <- sum(F);
- cramers_v <- sqrt(chi_squared/(W*(q-1)));
-
- basestats[2,pairID] <- chi_squared;
- basestats[3,pairID] <- degFreedom;
- basestats[4,pairID] <- pValue;
- basestats[5,pairID] <- cramers_v;
- }
- }
- }
-}
-
-writeMM(as(basestats, "CsparseMatrix"), paste(args[3], "bivar.stats", sep=""));
-writeMM(as(cat_counts, "CsparseMatrix"), paste(args[3], "category.counts", sep=""));
-writeMM(as(cat_means, "CsparseMatrix"), paste(args[3], "category.means", sep=""));
-writeMM(as(cat_vars, "CsparseMatrix"), paste(args[3], "category.variances", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+
+D1 <- readMM(paste(args[1], "D.mtx", sep=""))
+S11 <- readMM(paste(args[1], "S1.mtx", sep=""))
+S21 <- readMM(paste(args[1], "S2.mtx", sep=""))
+K11 <- readMM(paste(args[1], "K1.mtx", sep=""))
+K21 <- readMM(paste(args[1], "K2.mtx", sep=""))
+D <- as.matrix(D1);
+S1 <- as.matrix(S11);
+S2 <- as.matrix(S21);
+K1 <- as.matrix(K11);
+K2 <- as.matrix(K21);
+
+numPairs <- ncol(S1) * ncol(S2); # number of attribute pairs (|S1|*|S2|)
+maxC <- args[2]; # max number of categories in any categorical attribute
+
+s1size <- ncol(S1);
+s2size <- ncol(S2);
+
+# R, chisq, cramers, spearman, eta, anovaf
+numstats <- 8;
+basestats <- array(0,dim=c(numstats,numPairs));
+cat_counts <- array(0,dim=c(maxC,numPairs));
+cat_means <- array(0,dim=c(maxC,numPairs));
+cat_vars <- array(0,dim=c(maxC,numPairs));
+
+
+for( i in 1:s1size ) {
+ a1 <- S1[,i];
+ k1 <- K1[1,i];
+ A1 <- as.matrix(D[,a1]);
+
+ for( j in 1:s2size ) {
+ pairID <-(i-1)*s2size+j;
+ a2 <- S2[,j];
+ k2 <- K2[1,j];
+ A2 <- as.matrix(D[,a2]);
+
+ if (k1 == k2) {
+ if (k1 == 1) {
+ # scale-scale
+ print("scale-scale");
+ basestats[1,pairID] <- cor(D[,a1], D[,a2]);
+ #basestats[1,pairID] <- cor(A1, A2);
+
+ print(basestats[1,pairID]);
+ } else {
+ # nominal-nominal or ordinal-ordinal
+ print("categorical-categorical");
+ F <- table(A1,A2);
+ cst <- chisq.test(F);
+ chi_squared <- as.numeric(cst[1]);
+ degFreedom <- (nrow(F)-1)*(ncol(F)-1);
+ pValue <- as.numeric(cst[3]);
+ q <- min(dim(F));
+ W <- sum(F);
+ cramers_v <- sqrt(chi_squared/(W*(q-1)));
+
+ basestats[2,pairID] <- chi_squared;
+ basestats[3,pairID] <- degFreedom;
+ basestats[4,pairID] <- pValue;
+ basestats[5,pairID] <- cramers_v;
+
+ if ( k1 == 3 ) {
+ # ordinal-ordinal
+ print("ordinal-ordinal");
+ basestats[6,pairID] <- cor(A1,A2, method="spearman");
+ }
+ }
+ }
+ else {
+ if (k1 == 1 || k2 == 1) {
+ # Scale-nominal/ordinal
+ print("scale-categorical");
+ if ( k1 == 1 ) {
+ Av <- as.matrix(A2);
+ Yv <- as.matrix(A1);
+ }
+ else {
+ Av <- as.matrix(A1);
+ Yv <- as.matrix(A2);
+ }
+
+ W <- nrow(Av);
+ my <- mean(Yv);
+ varY <- var(Yv);
+
+ CFreqs <- as.matrix(table(Av));
+ CMeans <- as.matrix(aggregate(Yv, by=list(Av), "mean")$V1);
+ CVars <- as.matrix(aggregate(Yv, by=list(Av), "var")$V1);
+ R <- nrow(CFreqs);
+
+ Eta <- sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
+ anova_num <- sum( (CFreqs*(CMeans-my)^2) )/(R-1);
+ anova_den <- sum( (CFreqs-1)*CVars )/(W-R);
+ ANOVAF <- anova_num/anova_den;
+
+ basestats[7,pairID] <- Eta;
+ basestats[8,pairID] <- ANOVAF;
+
+ cat_counts[ 1:length(CFreqs),pairID] <- CFreqs;
+ cat_means[ 1:length(CMeans),pairID] <- CMeans;
+ cat_vars[ 1:length(CVars),pairID] <- CVars;
+ }
+ else {
+ # nominal-ordinal or ordinal-nominal
+ print("nomial-ordinal"); #TODO should not be same code
+ F <- table(A1,A2);
+ cst <- chisq.test(F);
+ chi_squared <- as.numeric(cst[1]);
+ degFreedom <- (nrow(F)-1)*(ncol(F)-1);
+ pValue <- as.numeric(cst[3]);
+ q <- min(dim(F));
+ W <- sum(F);
+ cramers_v <- sqrt(chi_squared/(W*(q-1)));
+
+ basestats[2,pairID] <- chi_squared;
+ basestats[3,pairID] <- degFreedom;
+ basestats[4,pairID] <- pValue;
+ basestats[5,pairID] <- cramers_v;
+ }
+ }
+ }
+}
+
+writeMM(as(basestats, "CsparseMatrix"), paste(args[3], "bivar.stats", sep=""));
+writeMM(as(cat_counts, "CsparseMatrix"), paste(args[3], "category.counts", sep=""));
+writeMM(as(cat_means, "CsparseMatrix"), paste(args[3], "category.means", sep=""));
+writeMM(as(cat_vars, "CsparseMatrix"), paste(args[3], "category.variances", sep=""));
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/parfor/parfor_bivariate0.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/parfor/parfor_bivariate0.dml b/src/test/scripts/applications/parfor/parfor_bivariate0.dml
index ac79abd..341a0b1 100644
--- a/src/test/scripts/applications/parfor/parfor_bivariate0.dml
+++ b/src/test/scripts/applications/parfor/parfor_bivariate0.dml
@@ -1,265 +1,265 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-
-/*
- *
- * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
- * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n}
- * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
- *
- * Seven inputs:
- * $1) D - input data
- * $2) S1 - First attribute set {A_11, A_12, ... A_1m}
- * $3) S2 - Second attribute set {A_21, A_22, ... A_2n}
- * $4) K1 - kind for attributes in S1
- * $5) K2 - kind for attributes in S2
- * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
- * $6) numPairs - total number of pairs (m*n)
- * $7) maxC - maximum number of categories in any categorical attribute
- *
- * One output:
- * $6) output directory in which following four statistics files are created
- * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs
- * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F)
- * + categorical.counts -
- * + categorical.means -
- * + categorical.variances -
- * -> Values in these three matrices are applicable only for scale-categorical attribute pairs.
- * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k.
- */
-
-D = read($1, rows=$7, cols=$8); # input data set
-S1 = read($2, rows=1, cols=$9); # attribute set 1
-S2 = read($3, rows=1, cols=$9); # attribute set 2
-K1 = read($4, rows=1, cols=$9); # kind for attributes in S1
-K2 = read($5, rows=1, cols=$9); # kind for attributes in S2
-numPairs = $10; # number of attribute pairs (|S1|*|S2|)
-maxC = $11; # max number of categories in any categorical attribute
-
-s1size = ncol(S1);
-s2size = ncol(S2);
-
-# R, chisq, cramers, spearman, eta, anovaf
-numstats = 8;
-basestats = matrix(0, rows=numstats, cols=numPairs);
-cat_counts = matrix(0, rows=maxC, cols=numPairs);
-cat_means = matrix(0, rows=maxC, cols=numPairs);
-cat_vars = matrix(0, rows=maxC, cols=numPairs);
-
-
-for( i in 1:s1size ) {
- a1 = castAsScalar(S1[,i]);
- k1 = castAsScalar(K1[1,i]);
- A1 = D[,a1];
- #print("a1="+a1);
-
- for( j in 1:s2size ) {
- pairID = (i-1)*s2size+j;
- #print("ID="+pairID+"(i="+i+",j="+j+")");
- a2 = castAsScalar(S2[,j]);
- k2 = castAsScalar(K2[1,j]);
- A2 = D[,a2];
- #print("a2="+a2);
-
- if (k1 == k2) {
- if (k1 == 1) {
- # scale-scale
- print("[" + i + "," + j + "] scale-scale");
- r = bivar_ss(A1,A2);
- basestats[1,pairID] = r;
- #print("scale:"+r);
- } else {
- # nominal-nominal or ordinal-ordinal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
- basestats[2,pairID] = chisq;
- basestats[3,pairID] = df;
- basestats[4,pairID] = pval;
- basestats[5,pairID] = cramersv;
-
- if ( k1 == 3 ) {
- # ordinal-ordinal
- print("[" + i + "," + j + "] ordinal-ordinal");
- sp = bivar_oo(A1, A2);
- basestats[6,pairID] = sp;
- }
- }
- }
- else {
- if (k1 == 1 | k2 == 1) {
- # Scale-nominal/ordinal TODO MB correctness errors
- print("[" + i + "," + j + "] scale-categorical");
-
- if ( k1 == 1 ) {
- [eta,f, counts, means, vars] = bivar_sc(A1,A2);
- }
- else {
- [eta,f, counts, means, vars] = bivar_sc(A2,A1);
- }
- basestats[7,pairID] = eta;
- basestats[8,pairID] = f;
- cat_counts[,pairID] = counts;
- cat_means[,pairID] = means;
- cat_vars[,pairID] = vars;
- }
- else {
- # nominal-ordinal or ordinal-nominal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
- basestats[2,pairID] = chisq;
- basestats[3,pairID] = df;
- basestats[4,pairID] = pval;
- basestats[5,pairID] = cramersv;
- }
- }
- }
-}
-
-write(basestats, $6 + "/bivar.stats");
-write(cat_counts, $6 + "/category.counts");
-write(cat_means, $6 + "/category.means");
-write(cat_vars, $6 + "/category.variances");
-
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) {
-
- # Contingency Table
- F = table(A,B);
-
- # Chi-Squared
- W = sum(F);
- r = rowSums(F);
- c = colSums(F);
- E = (r %*% c)/W;
- T = (F-E)^2/E;
- chi_squared = sum(T);
-
- # compute p-value
- degFreedom = (nrow(F)-1)*(ncol(F)-1);
- pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
- # Cramer's V
- R = nrow(F);
- C = ncol(F);
- q = min(R,C);
- cramers_v = sqrt(chi_squared/(W*(q-1)));
-
- # Assign return values
- chisq = chi_squared;
- df = as.double(degFreedom);
- pval = pValue;
- cramersv = cramers_v;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) {
-
- # Unweighted co-variance
- covXY = cov(X,Y);
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- W = nrow(X);
- m2X = moment(X,2);
- m2Y = moment(Y,2);
- sigmaX = sqrt(m2X * (W/(W-1.0)) );
- sigmaY = sqrt(m2Y * (W/(W-1.0)) );
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY);
- #print("Rmx="+m2X);
- #print("Rmy="+m2Y);
- #print("Rcov="+covXY);
- #print("Rsx="+sigmaX);
- #print("Rsy="+sigmaY);
- #print("R="+R);
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
-
- # mean and variance in target variable
- W = nrow(A);
- my = mean(Y);
- varY = moment(Y,2) * W/(W-1.0)
-
- # category-wise (frequencies, means, variances)
- CFreqs = aggregate(target=Y, groups=A, fn="count");
- CMeans = aggregate(target=Y, groups=A, fn="mean");
- CVars = aggregate(target=Y, groups=A, fn="variance");
-
- # number of categories
- R = nrow(CFreqs);
-
- Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
-
- anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1);
- anova_den = sum( (CFreqs-1)*CVars )/(W-R);
- AnovaF = anova_num/anova_den;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-# Function to compute ranks
-# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
-computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
- Ranks = cumsum(X) - X/2 + 1/2;
-}
-
-#-------------------------------------------------------------------------
-
-bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) {
-
- # compute contingency table
- F = table(A,B);
-
- catA = nrow(F); # number of categories in A
- catB = ncol(F); # number of categories in B
-
- # compute category-wise counts for both the attributes
- R = rowSums(F);
- S = colSums(F);
-
- # compute scores, both are column vectors
- [C] = computeRanks(R);
- meanX = mean(C,R);
-
- columnS = t(S);
- [D] = computeRanks(columnS);
-
- # scores (C,D) are individual values, and counts (R,S) act as weights
- meanY = mean(D,columnS);
-
- W = sum(F); # total weight, or total #cases
- varX = moment(C,R,2)*(W/(W-1.0));
- varY = moment(D,columnS,2)*(W/(W-1.0));
- covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
-
- sp = covXY/(sqrt(varX)*sqrt(varY));
-}
-
-# -----------------------------------------------------------------------------------------------------------
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*
+ *
+ * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
+ * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n}
+ * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
+ *
+ * Seven inputs:
+ * $1) D - input data
+ * $2) S1 - First attribute set {A_11, A_12, ... A_1m}
+ * $3) S2 - Second attribute set {A_21, A_22, ... A_2n}
+ * $4) K1 - kind for attributes in S1
+ * $5) K2 - kind for attributes in S2
+ * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
+ * $6) numPairs - total number of pairs (m*n)
+ * $7) maxC - maximum number of categories in any categorical attribute
+ *
+ * One output:
+ * $6) output directory in which following four statistics files are created
+ * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs
+ * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F)
+ * + categorical.counts -
+ * + categorical.means -
+ * + categorical.variances -
+ * -> Values in these three matrices are applicable only for scale-categorical attribute pairs.
+ * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k.
+ */
+
+D = read($1, rows=$7, cols=$8); # input data set
+S1 = read($2, rows=1, cols=$9); # attribute set 1
+S2 = read($3, rows=1, cols=$9); # attribute set 2
+K1 = read($4, rows=1, cols=$9); # kind for attributes in S1
+K2 = read($5, rows=1, cols=$9); # kind for attributes in S2
+numPairs = $10; # number of attribute pairs (|S1|*|S2|)
+maxC = $11; # max number of categories in any categorical attribute
+
+s1size = ncol(S1);
+s2size = ncol(S2);
+
+# R, chisq, cramers, spearman, eta, anovaf
+numstats = 8;
+basestats = matrix(0, rows=numstats, cols=numPairs);
+cat_counts = matrix(0, rows=maxC, cols=numPairs);
+cat_means = matrix(0, rows=maxC, cols=numPairs);
+cat_vars = matrix(0, rows=maxC, cols=numPairs);
+
+
+for( i in 1:s1size ) {
+ a1 = castAsScalar(S1[,i]);
+ k1 = castAsScalar(K1[1,i]);
+ A1 = D[,a1];
+ #print("a1="+a1);
+
+ for( j in 1:s2size ) {
+ pairID = (i-1)*s2size+j;
+ #print("ID="+pairID+"(i="+i+",j="+j+")");
+ a2 = castAsScalar(S2[,j]);
+ k2 = castAsScalar(K2[1,j]);
+ A2 = D[,a2];
+ #print("a2="+a2);
+
+ if (k1 == k2) {
+ if (k1 == 1) {
+ # scale-scale
+ print("[" + i + "," + j + "] scale-scale");
+ r = bivar_ss(A1,A2);
+ basestats[1,pairID] = r;
+ #print("scale:"+r);
+ } else {
+ # nominal-nominal or ordinal-ordinal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
+ basestats[2,pairID] = chisq;
+ basestats[3,pairID] = df;
+ basestats[4,pairID] = pval;
+ basestats[5,pairID] = cramersv;
+
+ if ( k1 == 3 ) {
+ # ordinal-ordinal
+ print("[" + i + "," + j + "] ordinal-ordinal");
+ sp = bivar_oo(A1, A2);
+ basestats[6,pairID] = sp;
+ }
+ }
+ }
+ else {
+ if (k1 == 1 | k2 == 1) {
+ # Scale-nominal/ordinal TODO MB correctness errors
+ print("[" + i + "," + j + "] scale-categorical");
+
+ if ( k1 == 1 ) {
+ [eta,f, counts, means, vars] = bivar_sc(A1,A2);
+ }
+ else {
+ [eta,f, counts, means, vars] = bivar_sc(A2,A1);
+ }
+ basestats[7,pairID] = eta;
+ basestats[8,pairID] = f;
+ cat_counts[,pairID] = counts;
+ cat_means[,pairID] = means;
+ cat_vars[,pairID] = vars;
+ }
+ else {
+ # nominal-ordinal or ordinal-nominal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
+ basestats[2,pairID] = chisq;
+ basestats[3,pairID] = df;
+ basestats[4,pairID] = pval;
+ basestats[5,pairID] = cramersv;
+ }
+ }
+ }
+}
+
+write(basestats, $6 + "/bivar.stats");
+write(cat_counts, $6 + "/category.counts");
+write(cat_means, $6 + "/category.means");
+write(cat_vars, $6 + "/category.variances");
+
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) {
+
+ # Contingency Table
+ F = table(A,B);
+
+ # Chi-Squared
+ W = sum(F);
+ r = rowSums(F);
+ c = colSums(F);
+ E = (r %*% c)/W;
+ T = (F-E)^2/E;
+ chi_squared = sum(T);
+
+ # compute p-value
+ degFreedom = (nrow(F)-1)*(ncol(F)-1);
+ pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+ # Cramer's V
+ R = nrow(F);
+ C = ncol(F);
+ q = min(R,C);
+ cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+ # Assign return values
+ chisq = chi_squared;
+ df = as.double(degFreedom);
+ pval = pValue;
+ cramersv = cramers_v;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) {
+
+ # Unweighted co-variance
+ covXY = cov(X,Y);
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ W = nrow(X);
+ m2X = moment(X,2);
+ m2Y = moment(Y,2);
+ sigmaX = sqrt(m2X * (W/(W-1.0)) );
+ sigmaY = sqrt(m2Y * (W/(W-1.0)) );
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY);
+ #print("Rmx="+m2X);
+ #print("Rmy="+m2Y);
+ #print("Rcov="+covXY);
+ #print("Rsx="+sigmaX);
+ #print("Rsy="+sigmaY);
+ #print("R="+R);
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
+
+ # mean and variance in target variable
+ W = nrow(A);
+ my = mean(Y);
+ varY = moment(Y,2) * W/(W-1.0)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs = aggregate(target=Y, groups=A, fn="count");
+ CMeans = aggregate(target=Y, groups=A, fn="mean");
+ CVars = aggregate(target=Y, groups=A, fn="variance");
+
+ # number of categories
+ R = nrow(CFreqs);
+
+ Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
+
+ anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1);
+ anova_den = sum( (CFreqs-1)*CVars )/(W-R);
+ AnovaF = anova_num/anova_den;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+# Function to compute ranks
+# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
+computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
+ Ranks = cumsum(X) - X/2 + 1/2;
+}
+
+#-------------------------------------------------------------------------
+
+bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) {
+
+ # compute contingency table
+ F = table(A,B);
+
+ catA = nrow(F); # number of categories in A
+ catB = ncol(F); # number of categories in B
+
+ # compute category-wise counts for both the attributes
+ R = rowSums(F);
+ S = colSums(F);
+
+ # compute scores, both are column vectors
+ [C] = computeRanks(R);
+ meanX = mean(C,R);
+
+ columnS = t(S);
+ [D] = computeRanks(columnS);
+
+ # scores (C,D) are individual values, and counts (R,S) act as weights
+ meanY = mean(D,columnS);
+
+ W = sum(F); # total weight, or total #cases
+ varX = moment(C,R,2)*(W/(W-1.0));
+ varY = moment(D,columnS,2)*(W/(W-1.0));
+ covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
+
+ sp = covXY/(sqrt(varX)*sqrt(varY));
+}
+
+# -----------------------------------------------------------------------------------------------------------
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/parfor/parfor_bivariate1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/parfor/parfor_bivariate1.dml b/src/test/scripts/applications/parfor/parfor_bivariate1.dml
index 69a7ad6..aa40d7c 100644
--- a/src/test/scripts/applications/parfor/parfor_bivariate1.dml
+++ b/src/test/scripts/applications/parfor/parfor_bivariate1.dml
@@ -1,256 +1,256 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-
-/*
- *
- * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
- * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n}
- * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
- *
- * Seven inputs:
- * $1) D - input data
- * $2) S1 - First attribute set {A_11, A_12, ... A_1m}
- * $3) S2 - Second attribute set {A_21, A_22, ... A_2n}
- * $4) K1 - kind for attributes in S1
- * $5) K2 - kind for attributes in S2
- * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
- * $6) numPairs - total number of pairs (m*n)
- * $7) maxC - maximum number of categories in any categorical attribute
- *
- * One output:
- * $6) output directory in which following four statistics files are created
- * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs
- * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F)
- * + categorical.counts -
- * + categorical.means -
- * + categorical.variances -
- * -> Values in these three matrices are applicable only for scale-categorical attribute pairs.
- * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k.
- */
-
-D = read($1, rows=$7, cols=$8); # input data set
-S1 = read($2, rows=1, cols=$9); # attribute set 1
-S2 = read($3, rows=1, cols=$9); # attribute set 2
-K1 = read($4, rows=1, cols=$9); # kind for attributes in S1
-K2 = read($5, rows=1, cols=$9); # kind for attributes in S2
-numPairs = $10; # number of attribute pairs (|S1|*|S2|)
-maxC = $11; # max number of categories in any categorical attribute
-
-s1size = ncol(S1);
-s2size = ncol(S2);
-
-# R, chisq, cramers, spearman, eta, anovaf
-numstats = 8;
-basestats = matrix(0, rows=numstats, cols=numPairs);
-cat_counts = matrix(0, rows=maxC, cols=numPairs);
-cat_means = matrix(0, rows=maxC, cols=numPairs);
-cat_vars = matrix(0, rows=maxC, cols=numPairs);
-
-
-parfor( i in 1:s1size, par=4, mode=LOCAL, check=0, opt=NONE) {
- a1 = castAsScalar(S1[,i]);
- k1 = castAsScalar(K1[1,i]);
- A1 = D[,a1];
-
- parfor( j in 1:s2size, par=4, mode=LOCAL, check=0, opt=NONE) {
- pairID = (i-1)*s2size+j;
- a2 = castAsScalar(S2[,j]);
- k2 = castAsScalar(K2[1,j]);
- A2 = D[,a2];
-
- if (k1 == k2) {
- if (k1 == 1) {
- # scale-scale
- print("[" + i + "," + j + "] scale-scale");
- r = bivar_ss(A1,A2);
- basestats[1,pairID] = r;
- } else {
- # nominal-nominal or ordinal-ordinal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
- basestats[2,pairID] = chisq;
- basestats[3,pairID] = df;
- basestats[4,pairID] = pval;
- basestats[5,pairID] = cramersv;
-
- if ( k1 == 3 ) {
- # ordinal-ordinal
- print("[" + i + "," + j + "] ordinal-ordinal");
- sp = bivar_oo(A1, A2);
- basestats[6,pairID] = sp;
- }
- }
- }
- else {
- if (k1 == 1 | k2 == 1) {
- # Scale-nominal/ordinal TODO MB correctness errors
- print("[" + i + "," + j + "] scale-categorical");
-
- if ( k1 == 1 ) {
- [eta,f, counts, means, vars] = bivar_sc(A1,A2);
- }
- else {
- [eta,f, counts, means, vars] = bivar_sc(A2,A1);
- }
- basestats[7,pairID] = eta;
- basestats[8,pairID] = f;
- cat_counts[,pairID] = counts;
- cat_means[,pairID] = means;
- cat_vars[,pairID] = vars;
- }
- else {
- # nominal-ordinal or ordinal-nominal
- print("[" + i + "," + j + "] categorical-categorical");
- [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
- basestats[2,pairID] = chisq;
- basestats[3,pairID] = df;
- basestats[4,pairID] = pval;
- basestats[5,pairID] = cramersv;
- }
- }
- }
-}
-
-write(basestats, $6 + "/bivar.stats");
-write(cat_counts, $6 + "/category.counts");
-write(cat_means, $6 + "/category.means");
-write(cat_vars, $6 + "/category.variances");
-
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) {
-
- # Contingency Table
- F = table(A,B);
-
- # Chi-Squared
- W = sum(F);
- r = rowSums(F);
- c = colSums(F);
- E = (r %*% c)/W;
- T = (F-E)^2/E;
- chi_squared = sum(T);
-
- # compute p-value
- degFreedom = (nrow(F)-1)*(ncol(F)-1);
- pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
- # Cramer's V
- R = nrow(F);
- C = ncol(F);
- q = min(R,C);
- cramers_v = sqrt(chi_squared/(W*(q-1)));
-
- # Assign return values
- chisq = chi_squared;
- df = as.double(degFreedom);
- pval = pValue;
- cramersv = cramers_v;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) {
-
- # Unweighted co-variance
- covXY = cov(X,Y);
-
- # compute standard deviations for both X and Y by computing 2^nd central moment
- W = nrow(X);
- m2X = moment(X,2);
- m2Y = moment(Y,2);
- sigmaX = sqrt(m2X * (W/(W-1.0)) );
- sigmaY = sqrt(m2Y * (W/(W-1.0)) );
-
- # Pearson's R
- R = covXY / (sigmaX*sigmaY);
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-# Y points to SCALE variable
-# A points to CATEGORICAL variable
-bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
-
- # mean and variance in target variable
- W = nrow(A);
- my = mean(Y);
- varY = moment(Y,2) * W/(W-1.0)
-
- # category-wise (frequencies, means, variances)
- CFreqs = aggregate(target=Y, groups=A, fn="count");
- CMeans = aggregate(target=Y, groups=A, fn="mean");
- CVars = aggregate(target=Y, groups=A, fn="variance");
-
- # number of categories
- R = nrow(CFreqs);
-
- Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
-
- anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1);
- anova_den = sum( (CFreqs-1)*CVars )/(W-R);
- AnovaF = anova_num/anova_den;
-}
-
-# -----------------------------------------------------------------------------------------------------------
-# Function to compute ranks
-# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
-computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
- Ranks = cumsum(X) - X/2 + 1/2;
-}
-
-#-------------------------------------------------------------------------
-
-bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) {
-
- # compute contingency table
- F = table(A,B);
-
- catA = nrow(F); # number of categories in A
- catB = ncol(F); # number of categories in B
-
- # compute category-wise counts for both the attributes
- R = rowSums(F);
- S = colSums(F);
-
- # compute scores, both are column vectors
- [C] = computeRanks(R);
- meanX = mean(C,R);
-
- columnS = t(S);
- [D] = computeRanks(columnS);
-
- # scores (C,D) are individual values, and counts (R,S) act as weights
- meanY = mean(D,columnS);
-
- W = sum(F); # total weight, or total #cases
- varX = moment(C,R,2)*(W/(W-1.0));
- varY = moment(D,columnS,2)*(W/(W-1.0));
- covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
-
- sp = covXY/(sqrt(varX)*sqrt(varY));
-}
-
-# -----------------------------------------------------------------------------------------------------------
-
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*
+ *
+ * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs
+ * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n}
+ * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n)
+ *
+ * Seven inputs:
+ * $1) D - input data
+ * $2) S1 - First attribute set {A_11, A_12, ... A_1m}
+ * $3) S2 - Second attribute set {A_21, A_22, ... A_2n}
+ * $4) K1 - kind for attributes in S1
+ * $5) K2 - kind for attributes in S2
+ * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal
+ * $6) numPairs - total number of pairs (m*n)
+ * $7) maxC - maximum number of categories in any categorical attribute
+ *
+ * One output:
+ * $6) output directory in which following four statistics files are created
+ * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs
+ * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F)
+ * + categorical.counts -
+ * + categorical.means -
+ * + categorical.variances -
+ * -> Values in these three matrices are applicable only for scale-categorical attribute pairs.
+ * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k.
+ */
+
+D = read($1, rows=$7, cols=$8); # input data set
+S1 = read($2, rows=1, cols=$9); # attribute set 1
+S2 = read($3, rows=1, cols=$9); # attribute set 2
+K1 = read($4, rows=1, cols=$9); # kind for attributes in S1
+K2 = read($5, rows=1, cols=$9); # kind for attributes in S2
+numPairs = $10; # number of attribute pairs (|S1|*|S2|)
+maxC = $11; # max number of categories in any categorical attribute
+
+s1size = ncol(S1);
+s2size = ncol(S2);
+
+# R, chisq, cramers, spearman, eta, anovaf
+numstats = 8;
+basestats = matrix(0, rows=numstats, cols=numPairs);
+cat_counts = matrix(0, rows=maxC, cols=numPairs);
+cat_means = matrix(0, rows=maxC, cols=numPairs);
+cat_vars = matrix(0, rows=maxC, cols=numPairs);
+
+
+parfor( i in 1:s1size, par=4, mode=LOCAL, check=0, opt=NONE) {
+ a1 = castAsScalar(S1[,i]);
+ k1 = castAsScalar(K1[1,i]);
+ A1 = D[,a1];
+
+ parfor( j in 1:s2size, par=4, mode=LOCAL, check=0, opt=NONE) {
+ pairID = (i-1)*s2size+j;
+ a2 = castAsScalar(S2[,j]);
+ k2 = castAsScalar(K2[1,j]);
+ A2 = D[,a2];
+
+ if (k1 == k2) {
+ if (k1 == 1) {
+ # scale-scale
+ print("[" + i + "," + j + "] scale-scale");
+ r = bivar_ss(A1,A2);
+ basestats[1,pairID] = r;
+ } else {
+ # nominal-nominal or ordinal-ordinal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
+ basestats[2,pairID] = chisq;
+ basestats[3,pairID] = df;
+ basestats[4,pairID] = pval;
+ basestats[5,pairID] = cramersv;
+
+ if ( k1 == 3 ) {
+ # ordinal-ordinal
+ print("[" + i + "," + j + "] ordinal-ordinal");
+ sp = bivar_oo(A1, A2);
+ basestats[6,pairID] = sp;
+ }
+ }
+ }
+ else {
+ if (k1 == 1 | k2 == 1) {
+ # Scale-nominal/ordinal TODO MB correctness errors
+ print("[" + i + "," + j + "] scale-categorical");
+
+ if ( k1 == 1 ) {
+ [eta,f, counts, means, vars] = bivar_sc(A1,A2);
+ }
+ else {
+ [eta,f, counts, means, vars] = bivar_sc(A2,A1);
+ }
+ basestats[7,pairID] = eta;
+ basestats[8,pairID] = f;
+ cat_counts[,pairID] = counts;
+ cat_means[,pairID] = means;
+ cat_vars[,pairID] = vars;
+ }
+ else {
+ # nominal-ordinal or ordinal-nominal
+ print("[" + i + "," + j + "] categorical-categorical");
+ [chisq, df, pval, cramersv] = bivar_cc(A1,A2);
+ basestats[2,pairID] = chisq;
+ basestats[3,pairID] = df;
+ basestats[4,pairID] = pval;
+ basestats[5,pairID] = cramersv;
+ }
+ }
+ }
+}
+
+write(basestats, $6 + "/bivar.stats");
+write(cat_counts, $6 + "/category.counts");
+write(cat_means, $6 + "/category.means");
+write(cat_vars, $6 + "/category.variances");
+
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) {
+
+ # Contingency Table
+ F = table(A,B);
+
+ # Chi-Squared
+ W = sum(F);
+ r = rowSums(F);
+ c = colSums(F);
+ E = (r %*% c)/W;
+ T = (F-E)^2/E;
+ chi_squared = sum(T);
+
+ # compute p-value
+ degFreedom = (nrow(F)-1)*(ncol(F)-1);
+ pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+ # Cramer's V
+ R = nrow(F);
+ C = ncol(F);
+ q = min(R,C);
+ cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+ # Assign return values
+ chisq = chi_squared;
+ df = as.double(degFreedom);
+ pval = pValue;
+ cramersv = cramers_v;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) {
+
+ # Unweighted co-variance
+ covXY = cov(X,Y);
+
+ # compute standard deviations for both X and Y by computing 2^nd central moment
+ W = nrow(X);
+ m2X = moment(X,2);
+ m2Y = moment(Y,2);
+ sigmaX = sqrt(m2X * (W/(W-1.0)) );
+ sigmaY = sqrt(m2Y * (W/(W-1.0)) );
+
+ # Pearson's R
+ R = covXY / (sigmaX*sigmaY);
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+# Y points to SCALE variable
+# A points to CATEGORICAL variable
+bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
+
+ # mean and variance in target variable
+ W = nrow(A);
+ my = mean(Y);
+ varY = moment(Y,2) * W/(W-1.0)
+
+ # category-wise (frequencies, means, variances)
+ CFreqs = aggregate(target=Y, groups=A, fn="count");
+ CMeans = aggregate(target=Y, groups=A, fn="mean");
+ CVars = aggregate(target=Y, groups=A, fn="variance");
+
+ # number of categories
+ R = nrow(CFreqs);
+
+ Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) ));
+
+ anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1);
+ anova_den = sum( (CFreqs-1)*CVars )/(W-R);
+ AnovaF = anova_num/anova_den;
+}
+
+# -----------------------------------------------------------------------------------------------------------
+# Function to compute ranks
+# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category
+computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) {
+ Ranks = cumsum(X) - X/2 + 1/2;
+}
+
+#-------------------------------------------------------------------------
+
+bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) {
+
+ # compute contingency table
+ F = table(A,B);
+
+ catA = nrow(F); # number of categories in A
+ catB = ncol(F); # number of categories in B
+
+ # compute category-wise counts for both the attributes
+ R = rowSums(F);
+ S = colSums(F);
+
+ # compute scores, both are column vectors
+ [C] = computeRanks(R);
+ meanX = mean(C,R);
+
+ columnS = t(S);
+ [D] = computeRanks(columnS);
+
+ # scores (C,D) are individual values, and counts (R,S) act as weights
+ meanY = mean(D,columnS);
+
+ W = sum(F); # total weight, or total #cases
+ varX = moment(C,R,2)*(W/(W-1.0));
+ varY = moment(D,columnS,2)*(W/(W-1.0));
+ covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) );
+
+ sp = covXY/(sqrt(varX)*sqrt(varY));
+}
+
+# -----------------------------------------------------------------------------------------------------------
+
+