Posted to commits@systemml.apache.org by mb...@apache.org on 2016/01/02 20:42:38 UTC

[4/4] incubator-systemml git commit: Improved naive bayes script (matrix aggregate, cleanup), incl new tests

Improved naive bayes script (matrix aggregate, cleanup), incl new tests

Performance testsuite results for the old vs. new naive Bayes scripts
(including invocation overhead):
a) Hybrid Spark (20GB driver)
10k x 1k, dense: 19s -> 2s 
10k x 1k, sparse: 19s -> 1s
100k x 1k, dense: 22s -> 4s
100k x 1k, sparse: 23s -> 2s
1M x 1k, dense: 87s -> 42s
1M x 1k, sparse: 38s -> 4s
10M x 1k, dense: 226s -> 81s
10M x 1k, sparse: 369s -> 40s
 
b) Hybrid MapReduce (2GB client)
10k x 1k, dense: 3s -> 3s
10k x 1k, sparse: 1s -> 1s
100k x 1k, dense: 58s -> 23s
100k x 1k, sparse: 5s -> 3s
1M x 1k, dense: 98s -> 84s
1M x 1k, sparse: 44s -> 9s
10M x 1k, dense: 253s -> 139s
10M x 1k, sparse: X -> 80s
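
The core of the change, also visible in the diff below, is replacing the
per-column parfor grouped aggregate with a single grouped aggregate over the
entire feature matrix. A minimal DML sketch of the old vs. new pattern
(names as in the script):

# old: one grouped aggregate per feature column, driven by a parfor loop
classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
parfor (i in 1:numFeatures) {
  Col = D[,i]
  classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
}

# new: single grouped aggregate over the full feature matrix D
classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses))

In both variants, cell (k,j) of classFeatureCounts holds the sum of D[i,j]
over all rows i with label C[i]==k; the matrix form lets the runtime execute
this as one aggregation instead of numFeatures separate ones, which is the
main source of the speedups above.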
 
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/8e7b6ed3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/8e7b6ed3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/8e7b6ed3

Branch: refs/heads/master
Commit: 8e7b6ed3df1c92eda2f0bf1390d1cdd8b59dc7e9
Parents: 5878e1d
Author: Matthias Boehm <mb...@us.ibm.com>
Authored: Fri Jan 1 19:11:55 2016 -0800
Committer: Matthias Boehm <mb...@us.ibm.com>
Committed: Fri Jan 1 19:11:55 2016 -0800

----------------------------------------------------------------------
 scripts/algorithms/naive-bayes.dml              |  13 +-
 .../algorithms/obsolete/naive-bayes-parfor.dml  | 100 +++++++++++++++
 .../applications/NaiveBayesParforTest.java      | 125 +++++++++++++++++++
 .../applications/NaiveBayesTest.java            |   2 +-
 .../dml/NaiveBayesParforDMLTest.java            |  40 ++++++
 .../pydml/NaiveBayesParforPyDMLTest.java        |  40 ++++++
 .../naive-bayes-parfor/naive-bayes.R            |  71 +++++++++++
 .../naive-bayes-parfor/naive-bayes.dml          |  78 ++++++++++++
 .../naive-bayes-parfor/naive-bayes.pydml        |  79 ++++++++++++
 .../applications/naive-bayes-parfor/readme.txt  |   1 +
 .../applications/naive-bayes/naive-bayes.dml    |  10 +-
 .../applications/naive-bayes/naive-bayes.pydml  |   9 +-
 .../integration/applications/ZPackageSuite.java |   2 +
 .../functions/aggregate/ZPackageSuite.java      |   1 +
 14 files changed, 548 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/scripts/algorithms/naive-bayes.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/naive-bayes.dml b/scripts/algorithms/naive-bayes.dml
index c90e64c..f2ad2ed 100644
--- a/scripts/algorithms/naive-bayes.dml
+++ b/scripts/algorithms/naive-bayes.dml
@@ -58,11 +58,7 @@ numFeatures = ncol(D)
 # Compute conditionals
 
 # Compute the feature counts for each class
-classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
-parfor (i in 1:numFeatures) {
-  Col = D[,i]
-  classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
-}
+classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses))
 
 # Compute the total feature count for each class 
 # and add the number of features to this sum
@@ -81,8 +77,8 @@ class_prior = class_counts / numRows;
 
 # Compute accuracy on training set
 ones = matrix(1, rows=numRows, cols=1)
-D_w_ones = append(D, ones)
-model = append(class_conditionals, class_prior)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
 log_probs = D_w_ones %*% t(log(model))
 pred = rowIndexMax(log_probs)
 acc = sum(ppred(pred, C, "==")) / numRows * 100
@@ -91,8 +87,7 @@ acc_str = "Training Accuracy (%): " + acc
 print(acc_str)
 write(acc_str, $accuracy)
 
-extra_model_params = matrix(0, rows=1, cols=1)
-extra_model_params[1, 1] = numFeatures
+extra_model_params = as.matrix(numFeatures)
 class_prior = t(append(t(class_prior), extra_model_params))
 
 # write out the model

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/scripts/algorithms/obsolete/naive-bayes-parfor.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/obsolete/naive-bayes-parfor.dml b/scripts/algorithms/obsolete/naive-bayes-parfor.dml
new file mode 100644
index 0000000..c90e64c
--- /dev/null
+++ b/scripts/algorithms/obsolete/naive-bayes-parfor.dml
@@ -0,0 +1,100 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+cmdLine_laplace = ifdef($laplace, 1)
+cmdLine_fmt = ifdef($fmt, "text")
+
+# reading input args
+D = read($X)
+min_feature_val = min(D)
+if(min_feature_val < 0)
+	stop("Stopping due to invalid argument: Multinomial naive Bayes is meant for count-based feature values, minimum value in X is negative")
+numRows = nrow(D)
+if(numRows < 2)
+	stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
+
+C = read($Y)
+if(min(C) < 1)
+	stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+numClasses = max(C)
+if(numClasses == 1)
+	stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")	
+mod1 = C %% 1
+mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "==")))
+if(mod1_should_be_nrow != numRows)
+	stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
+	
+laplace_correction = cmdLine_laplace
+if(laplace_correction < 0)
+	stop("Stopping due to invalid argument: Laplacian correction (laplace) must be non-negative")
+
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
+parfor (i in 1:numFeatures) {
+  Col = D[,i]
+  classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
+}
+
+# Compute the total feature count for each class 
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+#ones = matrix(1, rows=1, cols=numFeatures)
+#repClassSums = classSums %*% ones
+#class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
+class_conditionals = (classFeatureCounts + laplace_correction) / classSums
+
+# Compute class priors
+class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, rows=numRows, cols=1)
+D_w_ones = append(D, ones)
+model = append(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = rowIndexMax(log_probs)
+acc = sum(ppred(pred, C, "==")) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+write(acc_str, $accuracy)
+
+extra_model_params = matrix(0, rows=1, cols=1)
+extra_model_params[1, 1] = numFeatures
+class_prior = t(append(t(class_prior), extra_model_params))
+
+# write out the model
+write(class_prior, $prior, format=cmdLine_fmt);
+write(class_conditionals, $conditionals, format=cmdLine_fmt);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesParforTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesParforTest.java b/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesParforTest.java
new file mode 100644
index 0000000..7387e16
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesParforTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.applications;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+
+import org.junit.runners.Parameterized.Parameters;
+
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.utils.TestUtils;
+
+public abstract class NaiveBayesParforTest  extends AutomatedTestBase{
+	
+	protected final static String TEST_DIR = "applications/naive-bayes-parfor/";
+	protected final static String TEST_NAME = "naive-bayes";
+	protected String TEST_CLASS_DIR = TEST_DIR + NaiveBayesParforTest.class.getSimpleName() + "/";
+
+	protected int numRecords, numFeatures, numClasses;
+    protected double sparsity;
+    
+    public NaiveBayesParforTest(int rows, int cols, int nc, double sp) {
+		numRecords = rows;
+		numFeatures = cols;
+		numClasses = nc;
+		sparsity = sp;
+	}
+    
+    @Parameters
+	 public static Collection<Object[]> data() {
+	   Object[][] data = new Object[][] { 
+			   //sparse tests (sparsity=0.01)
+			   {100, 50, 10, 0.01}, // example running time: 3.5s (dml: .3s)
+			   {1000, 500, 10, 0.01}, // example running time: 5s (dml: .8s)
+			   {10000, 750, 10, 0.01}, // example running time: 32s (dml: .7s)
+			   //{100000, 1000, 10, 0.01}, // example running time: 471s (dml: 3s)
+			   //dense tests (sparsity=0.7)
+			   {100, 50, 10, 0.7}, // example running time: 2s (dml: .2s)
+			   {1000, 500, 10, 0.7}, // example running time: 6s (dml: .7s)
+			   {10000, 750, 10, 0.7} // example running time: 61s (dml: 5.6s)
+			   };
+	   return Arrays.asList(data);
+	 }
+	 
+	 @Override
+	 public void setUp() {
+		 addTestConfiguration(TEST_CLASS_DIR, TEST_NAME);
+	 }
+	 
+	 protected void testNaiveBayes(ScriptType scriptType)
+	 {
+		 System.out.println("------------ BEGIN " + TEST_NAME + " " + scriptType + " TEST {" + numRecords + ", "
+					+ numFeatures + ", " + numClasses + ", " + sparsity + "} ------------");
+		 this.scriptType = scriptType;
+		 
+		 int rows = numRecords;
+		 int cols = numFeatures;
+		 int classes = numClasses;
+		 double sparsity = this.sparsity;
+		 double laplace_correction = 1;
+	        
+		 getAndLoadTestConfiguration(TEST_NAME);
+	     
+		 List<String> proArgs = new ArrayList<String>();
+		 if (scriptType == ScriptType.PYDML) {
+			 proArgs.add("-python");
+		 }
+		 proArgs.add("-stats");
+		 proArgs.add("-nvargs");
+		 proArgs.add("X=" + input("X"));
+		 proArgs.add("Y=" + input("Y"));
+		 proArgs.add("classes=" + classes);
+		 proArgs.add("laplace=" + laplace_correction);
+		 proArgs.add("prior=" + output("prior"));
+		 proArgs.add("conditionals=" + output("conditionals"));
+		 proArgs.add("accuracy=" + output("accuracy"));
+		 programArgs = proArgs.toArray(new String[proArgs.size()]);
+		
+		 fullDMLScriptName = getScript();
+
+		 rCmd = getRCmd(inputDir(), Integer.toString(classes), Double.toString(laplace_correction), expectedDir());
+		 
+		 double[][] X = getRandomMatrix(rows, cols, 0, 1, sparsity, -1);
+		 double[][] Y = getRandomMatrix(rows, 1, 0, 1, 1, -1);
+		 for(int i=0; i<rows; i++){
+			 Y[i][0] = (int)(Y[i][0]*classes) + 1;
+			 Y[i][0] = (Y[i][0] > classes) ? classes : Y[i][0];
+	     }	
+	        
+		 writeInputMatrixWithMTD("X", X, true);
+		 writeInputMatrixWithMTD("Y", Y, true);
+	        
+		 runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+	        
+		 runRScript(true);
+	        
+		 HashMap<CellIndex, Double> priorR = readRMatrixFromFS("prior");
+		 HashMap<CellIndex, Double> priorSYSTEMML= readDMLMatrixFromHDFS("prior");
+		 HashMap<CellIndex, Double> conditionalsR = readRMatrixFromFS("conditionals");
+		 HashMap<CellIndex, Double> conditionalsSYSTEMML = readDMLMatrixFromHDFS("conditionals"); 
+		 TestUtils.compareMatrices(priorR, priorSYSTEMML, Math.pow(10, -12), "priorR", "priorSYSTEMML");
+		 TestUtils.compareMatrices(conditionalsR, conditionalsSYSTEMML, Math.pow(10.0, -12.0), "conditionalsR", "conditionalsSYSTEMML");
+	 }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesTest.java b/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesTest.java
index c4bdd53..8030cb9 100644
--- a/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/applications/NaiveBayesTest.java
@@ -54,7 +54,7 @@ public abstract class NaiveBayesTest  extends AutomatedTestBase{
 			   {100, 50, 10, 0.01}, // example running time: 3.5s (dml: .3s)
 			   {1000, 500, 10, 0.01}, // example running time: 5s (dml: .8s)
 			   {10000, 750, 10, 0.01}, // example running time: 32s (dml: .7s)
-			   {100000, 1000, 10, 0.01}, // example running time: 471s (dml: 3s)
+			   //{100000, 1000, 10, 0.01}, // example running time: 471s (dml: 3s)
 			   //dense tests (sparsity=0.7)
 			   {100, 50, 10, 0.7}, // example running time: 2s (dml: .2s)
 			   {1000, 500, 10, 0.7}, // example running time: 6s (dml: .7s)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/java/org/apache/sysml/test/integration/applications/dml/NaiveBayesParforDMLTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/applications/dml/NaiveBayesParforDMLTest.java b/src/test/java/org/apache/sysml/test/integration/applications/dml/NaiveBayesParforDMLTest.java
new file mode 100644
index 0000000..8320867
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/applications/dml/NaiveBayesParforDMLTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.applications.dml;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.apache.sysml.test.integration.applications.NaiveBayesParforTest;
+
+@RunWith(value = Parameterized.class)
+public class NaiveBayesParforDMLTest extends NaiveBayesParforTest {
+
+	public NaiveBayesParforDMLTest(int rows, int cols, int nc, double sp) {
+		super(rows, cols, nc, sp);
+		TEST_CLASS_DIR = TEST_DIR + NaiveBayesParforDMLTest.class.getSimpleName() + "/";
+	}
+
+	@Test
+	public void testNaiveBayesDml() {
+		testNaiveBayes(ScriptType.DML);
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/java/org/apache/sysml/test/integration/applications/pydml/NaiveBayesParforPyDMLTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/applications/pydml/NaiveBayesParforPyDMLTest.java b/src/test/java/org/apache/sysml/test/integration/applications/pydml/NaiveBayesParforPyDMLTest.java
new file mode 100644
index 0000000..cb6538f
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/applications/pydml/NaiveBayesParforPyDMLTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.applications.pydml;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.apache.sysml.test.integration.applications.NaiveBayesParforTest;
+
+@RunWith(value = Parameterized.class)
+public class NaiveBayesParforPyDMLTest extends NaiveBayesParforTest {
+
+	public NaiveBayesParforPyDMLTest(int rows, int cols, int nc, double sp) {
+		super(rows, cols, nc, sp);
+		TEST_CLASS_DIR = TEST_DIR + NaiveBayesParforPyDMLTest.class.getSimpleName() + "/";
+	}
+
+	@Test
+	public void testNaiveBayesPyDml() {
+		testNaiveBayes(ScriptType.PYDML);
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
new file mode 100644
index 0000000..dc65b8a
--- /dev/null
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R
@@ -0,0 +1,71 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+
+library("Matrix")
+
+D = as.matrix(readMM(paste(args[1], "X.mtx", sep="")))
+C = as.matrix(readMM(paste(args[1], "Y.mtx", sep="")))
+
+# reading input args
+numClasses = as.integer(args[2]);
+laplace_correction = as.double(args[3]);
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, numClasses, numFeatures)
+for (i in 1:numFeatures) {
+  Col = D[,i]
+  classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2];
+}
+
+# Compute the total feature count for each class 
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = matrix(1, 1, numFeatures)
+repClassSums = classSums %*% ones;
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums;
+
+# Compute class priors
+class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2]
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, numRows, 1)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = max.col(log_probs,ties.method="last");
+acc = sum(pred == C) / numRows * 100
+
+print(paste("Training Accuracy (%): ", acc, sep=""))
+
+# write out the model
+writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep=""));
+writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
new file mode 100644
index 0000000..d0ce8d1
--- /dev/null
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml
@@ -0,0 +1,78 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = read($X)
+C = read($Y)
+laplace_correction = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
+parfor (i in 1:numFeatures) {
+  Col = D[,i]
+  classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
+}
+
+# Compute the total feature count for each class 
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = matrix(1, rows=1, cols=numFeatures)
+repClassSums = classSums %*% ones
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
+
+# Compute class priors
+class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses))
+class_prior = class_counts / numRows;
+
+# Compute accuracy on training set
+ones = matrix(1, rows=numRows, cols=1)
+D_w_ones = append(D, ones)
+model = append(class_conditionals, class_prior)
+log_probs = D_w_ones %*% t(log(model))
+pred = rowIndexMax(log_probs)
+acc = sum(ppred(pred, C, "==")) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+write(acc_str, $accuracy)
+
+# write out the model
+write(class_prior, $prior, format=fmt);
+write(class_conditionals, $conditionals, format=fmt);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
new file mode 100644
index 0000000..5d84951
--- /dev/null
+++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml
@@ -0,0 +1,79 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multinomial naive Bayes classifier with Laplace correction
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
+#
+
+# defaults
+# $laplace = 1
+fmt = ifdef($fmt, "text")
+
+# reading input args
+numClasses = $classes
+D = load($X)
+C = load($Y)
+laplace_correction = ifdef($laplace, 1)
+
+numRows = nrow(D)
+numFeatures = ncol(D)
+
+# Compute conditionals
+
+# Compute the feature counts for each class
+classFeatureCounts = full(0, rows=numClasses, cols=numFeatures)
+parfor (i in 1:numFeatures):
+    Col = D[,i]
+    classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses)
+
+# Compute the total feature count for each class 
+# and add the number of features to this sum
+# for subsequent regularization (Laplace's rule)
+classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction
+
+# Compute class conditional probabilities
+ones = full(1, rows=1, cols=numFeatures)
+repClassSums = dot(classSums, ones)
+class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums
+
+# Compute class priors
+class_counts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses)
+class_prior = class_counts / numRows
+
+# Compute accuracy on training set
+ones = full(1, rows=numRows, cols=1)
+D_w_ones = append(D, ones)
+model = append(class_conditionals, class_prior)
+log_model = log(model)
+transpose_log_model = log_model.transpose()
+log_probs = dot(D_w_ones, transpose_log_model)
+pred = rowIndexMax(log_probs)
+acc = sum(ppred(pred, C, "==")) / numRows * 100
+
+acc_str = "Training Accuracy (%): " + acc
+print(acc_str)
+save(acc_str, $accuracy)
+
+# write out the model
+save(class_prior, $prior, format=fmt)
+save(class_conditionals, $conditionals, format=fmt)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes-parfor/readme.txt
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes-parfor/readme.txt b/src/test/scripts/applications/naive-bayes-parfor/readme.txt
new file mode 100644
index 0000000..b68e0ce
--- /dev/null
+++ b/src/test/scripts/applications/naive-bayes-parfor/readme.txt
@@ -0,0 +1 @@
+This is the old naive bayes script, implemented via parfor. With the new grouped aggregate over matrices, this is not required anymore. However, for testing purposes we run both naive bayes scripts.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes/naive-bayes.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.dml b/src/test/scripts/applications/naive-bayes/naive-bayes.dml
index d0ce8d1..9b1558c 100644
--- a/src/test/scripts/applications/naive-bayes/naive-bayes.dml
+++ b/src/test/scripts/applications/naive-bayes/naive-bayes.dml
@@ -41,11 +41,7 @@ numFeatures = ncol(D)
 # Compute conditionals
 
 # Compute the feature counts for each class
-classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
-parfor (i in 1:numFeatures) {
-  Col = D[,i]
-  classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses))
-}
+classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses));
 
 # Compute the total feature count for each class 
 # and add the number of features to this sum
@@ -63,8 +59,8 @@ class_prior = class_counts / numRows;
 
 # Compute accuracy on training set
 ones = matrix(1, rows=numRows, cols=1)
-D_w_ones = append(D, ones)
-model = append(class_conditionals, class_prior)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
 log_probs = D_w_ones %*% t(log(model))
 pred = rowIndexMax(log_probs)
 acc = sum(ppred(pred, C, "==")) / numRows * 100

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
index 5d84951..073e1cb 100644
--- a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
+++ b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml
@@ -41,10 +41,7 @@ numFeatures = ncol(D)
 # Compute conditionals
 
 # Compute the feature counts for each class
-classFeatureCounts = full(0, rows=numClasses, cols=numFeatures)
-parfor (i in 1:numFeatures):
-    Col = D[,i]
-    classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses)
+classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=numClasses);
 
 # Compute the total feature count for each class 
 # and add the number of features to this sum
@@ -62,8 +59,8 @@ class_prior = class_counts / numRows
 
 # Compute accuracy on training set
 ones = full(1, rows=numRows, cols=1)
-D_w_ones = append(D, ones)
-model = append(class_conditionals, class_prior)
+D_w_ones = cbind(D, ones)
+model = cbind(class_conditionals, class_prior)
 log_model = log(model)
 transpose_log_model = log_model.transpose()
 log_probs = dot(D_w_ones, transpose_log_model)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test_suites/java/org/apache/sysml/test/integration/applications/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/applications/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/applications/ZPackageSuite.java
index 7dcb816..f9d1e91 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/applications/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/applications/ZPackageSuite.java
@@ -43,6 +43,7 @@ import org.junit.runners.Suite;
   org.apache.sysml.test.integration.applications.dml.MDABivariateStatsDMLTest.class,
   org.apache.sysml.test.integration.applications.dml.MultiClassSVMDMLTest.class,
   org.apache.sysml.test.integration.applications.dml.NaiveBayesDMLTest.class,
+  org.apache.sysml.test.integration.applications.dml.NaiveBayesParforDMLTest.class,
   org.apache.sysml.test.integration.applications.dml.PageRankDMLTest.class,
   org.apache.sysml.test.integration.applications.dml.WelchTDMLTest.class,
 
@@ -61,6 +62,7 @@ import org.junit.runners.Suite;
   org.apache.sysml.test.integration.applications.pydml.MDABivariateStatsPyDMLTest.class,
   org.apache.sysml.test.integration.applications.pydml.MultiClassSVMPyDMLTest.class,
   org.apache.sysml.test.integration.applications.pydml.NaiveBayesPyDMLTest.class,
+  org.apache.sysml.test.integration.applications.pydml.NaiveBayesParforPyDMLTest.class,
   org.apache.sysml.test.integration.applications.pydml.PageRankPyDMLTest.class,
   org.apache.sysml.test.integration.applications.pydml.WelchTPyDMLTest.class
   

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8e7b6ed3/src/test_suites/java/org/apache/sysml/test/integration/functions/aggregate/ZPackageSuite.java
----------------------------------------------------------------------
diff --git a/src/test_suites/java/org/apache/sysml/test/integration/functions/aggregate/ZPackageSuite.java b/src/test_suites/java/org/apache/sysml/test/integration/functions/aggregate/ZPackageSuite.java
index bf284fd..39432c5 100644
--- a/src/test_suites/java/org/apache/sysml/test/integration/functions/aggregate/ZPackageSuite.java
+++ b/src/test_suites/java/org/apache/sysml/test/integration/functions/aggregate/ZPackageSuite.java
@@ -44,6 +44,7 @@ import org.junit.runners.Suite;
 	FullAggregateTest.class,
 	FullColAggregateTest.class,
 	FullGroupedAggregateTest.class,
+	FullGroupedAggregateMatrixTest.class,
 	FullRowAggregateTest.class
 })