You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:18 UTC
[42/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/l2-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml
index 04b83b0..140ac5a 100644
--- a/scripts/algorithms/l2-svm.dml
+++ b/scripts/algorithms/l2-svm.dml
@@ -1,159 +1,159 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements binary-class SVM with squared slack variables
-#
-# Example Usage:
-# Assume L2SVM_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume epsilon = 0.001, lambda = 1, maxiterations = 100
-#
-# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
-#
-# Note about inputs:
-# Assumes that labels (entries in Y)
-# are set to either -1 or +1
-# or the result of recoding
-#
-
-cmdLine_fmt = ifdef($fmt, "text")
-cmdLine_icpt = ifdef($icpt, 0)
-cmdLine_tol = ifdef($tol, 0.001)
-cmdLine_reg = ifdef($reg, 1.0)
-cmdLine_maxiter = ifdef($maxiter, 100)
-
-X = read($X)
-Y = read($Y)
-
-if(nrow(X) < 2)
- stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows")
-
-check_min = min(Y)
-check_max = max(Y)
-num_min = sum(ppred(Y, check_min, "=="))
-num_max = sum(ppred(Y, check_max, "=="))
-
-if(check_min == check_max)
- stop("Stopping due to invalid inputs: Y seems to contain exactly one label")
-
-if(num_min + num_max != nrow(Y))
- stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels")
-
-if(check_min != -1 | check_max != +1)
- Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)
-
-positive_label = check_max
-negative_label = check_min
-
-continue = 1
-
-intercept = cmdLine_icpt
-if(intercept != 0 & intercept != 1)
- stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
-
-epsilon = cmdLine_tol
-if(epsilon < 0)
- stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
-
-lambda = cmdLine_reg
-if(lambda < 0)
- stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
-
-maxiterations = cmdLine_maxiter
-if(maxiterations < 1)
- stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
-
-num_samples = nrow(X)
-dimensions = ncol(X)
-
-if (intercept == 1) {
- ones = matrix(1, rows=num_samples, cols=1)
- X = append(X, ones);
-}
-
-num_rows_in_w = dimensions
-if(intercept == 1){
- num_rows_in_w = num_rows_in_w + 1
-}
-w = matrix(0, rows=num_rows_in_w, cols=1)
-
-g_old = t(X) %*% Y
-s = g_old
-
-Xw = matrix(0, rows=nrow(X), cols=1)
-debug_str = "# Iter, Obj"
-iter = 0
-while(continue == 1 & iter < maxiterations) {
- # minimizing primal obj along direction s
- step_sz = 0
- Xd = X %*% s
- wd = lambda * sum(w * s)
- dd = lambda * sum(s * s)
- continue1 = 1
- while(continue1 == 1){
- tmp_Xw = Xw + step_sz*Xd
- out = 1 - Y * (tmp_Xw)
- sv = ppred(out, 0, ">")
- out = out * sv
- g = wd + step_sz*dd - sum(out * Y * Xd)
- h = dd + sum(Xd * sv * Xd)
- step_sz = step_sz - g/h
- if (g*g/h < 0.0000000001){
- continue1 = 0
- }
- }
-
- #update weights
- w = w + step_sz*s
- Xw = Xw + step_sz*Xd
-
- out = 1 - Y * Xw
- sv = ppred(out, 0, ">")
- out = sv * out
- obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
- g_new = t(X) %*% (out * Y) - lambda * w
-
- print("ITER " + iter + ": OBJ=" + obj)
- debug_str = append(debug_str, iter + "," + obj)
-
- tmp = sum(s * g_old)
- if(step_sz*tmp < epsilon*obj){
- continue = 0
- }
-
- #non-linear CG step
- be = sum(g_new * g_new)/sum(g_old * g_old)
- s = be * s + g_new
- g_old = g_new
-
- iter = iter + 1
-}
-
-extra_model_params = matrix(0, rows=4, cols=1)
-extra_model_params[1,1] = positive_label
-extra_model_params[2,1] = negative_label
-extra_model_params[3,1] = intercept
-extra_model_params[4,1] = dimensions
-
-w = t(append(t(w), t(extra_model_params)))
-write(w, $model, format=cmdLine_fmt)
-
-write(debug_str, $Log)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements binary-class SVM with squared slack variables
+#
+# Example Usage:
+# Assume L2SVM_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume epsilon = 0.001, lambda = 1, maxiterations = 100
+#
+# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
+#
+# Note about inputs:
+# Assumes that labels (entries in Y)
+# are set to either -1 or +1
+# or the result of recoding
+#
+
+cmdLine_fmt = ifdef($fmt, "text")
+cmdLine_icpt = ifdef($icpt, 0)
+cmdLine_tol = ifdef($tol, 0.001)
+cmdLine_reg = ifdef($reg, 1.0)
+cmdLine_maxiter = ifdef($maxiter, 100)
+
+X = read($X)
+Y = read($Y)
+
+if(nrow(X) < 2)
+ stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows")
+
+check_min = min(Y)
+check_max = max(Y)
+num_min = sum(ppred(Y, check_min, "=="))
+num_max = sum(ppred(Y, check_max, "=="))
+
+if(check_min == check_max)
+ stop("Stopping due to invalid inputs: Y seems to contain exactly one label")
+
+if(num_min + num_max != nrow(Y))
+ stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels")
+
+if(check_min != -1 | check_max != +1)
+ Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min)
+
+positive_label = check_max
+negative_label = check_min
+
+continue = 1
+
+intercept = cmdLine_icpt
+if(intercept != 0 & intercept != 1)
+ stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
+
+epsilon = cmdLine_tol
+if(epsilon < 0)
+ stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
+
+lambda = cmdLine_reg
+if(lambda < 0)
+ stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
+
+maxiterations = cmdLine_maxiter
+if(maxiterations < 1)
+ stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
+
+num_samples = nrow(X)
+dimensions = ncol(X)
+
+if (intercept == 1) {
+ ones = matrix(1, rows=num_samples, cols=1)
+ X = append(X, ones);
+}
+
+num_rows_in_w = dimensions
+if(intercept == 1){
+ num_rows_in_w = num_rows_in_w + 1
+}
+w = matrix(0, rows=num_rows_in_w, cols=1)
+
+g_old = t(X) %*% Y
+s = g_old
+
+Xw = matrix(0, rows=nrow(X), cols=1)
+debug_str = "# Iter, Obj"
+iter = 0
+while(continue == 1 & iter < maxiterations) {
+ # minimizing primal obj along direction s
+ step_sz = 0
+ Xd = X %*% s
+ wd = lambda * sum(w * s)
+ dd = lambda * sum(s * s)
+ continue1 = 1
+ while(continue1 == 1){
+ tmp_Xw = Xw + step_sz*Xd
+ out = 1 - Y * (tmp_Xw)
+ sv = ppred(out, 0, ">")
+ out = out * sv
+ g = wd + step_sz*dd - sum(out * Y * Xd)
+ h = dd + sum(Xd * sv * Xd)
+ step_sz = step_sz - g/h
+ if (g*g/h < 0.0000000001){
+ continue1 = 0
+ }
+ }
+
+ #update weights
+ w = w + step_sz*s
+ Xw = Xw + step_sz*Xd
+
+ out = 1 - Y * Xw
+ sv = ppred(out, 0, ">")
+ out = sv * out
+ obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w)
+ g_new = t(X) %*% (out * Y) - lambda * w
+
+ print("ITER " + iter + ": OBJ=" + obj)
+ debug_str = append(debug_str, iter + "," + obj)
+
+ tmp = sum(s * g_old)
+ if(step_sz*tmp < epsilon*obj){
+ continue = 0
+ }
+
+ #non-linear CG step
+ be = sum(g_new * g_new)/sum(g_old * g_old)
+ s = be * s + g_new
+ g_old = g_new
+
+ iter = iter + 1
+}
+
+extra_model_params = matrix(0, rows=4, cols=1)
+extra_model_params[1,1] = positive_label
+extra_model_params[2,1] = negative_label
+extra_model_params[3,1] = intercept
+extra_model_params[4,1] = dimensions
+
+w = t(append(t(w), t(extra_model_params)))
+write(w, $model, format=cmdLine_fmt)
+
+write(debug_str, $Log)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/m-svm-predict.dml b/scripts/algorithms/m-svm-predict.dml
index 4d0c736..ba06cf6 100644
--- a/scripts/algorithms/m-svm-predict.dml
+++ b/scripts/algorithms/m-svm-predict.dml
@@ -1,84 +1,84 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# This script can be used to compute label predictions
-# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set
-#
-# Given ground truth labels, the script will compute an
-# accuracy (%) for the predictions
-#
-# Example Usage:
-# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text"
-#
-
-cmdLine_Y = ifdef($Y, " ")
-cmdLine_confusion = ifdef($confusion, " ")
-cmdLine_accuracy = ifdef($accuracy, " ")
-cmdLine_scores = ifdef($scores, " ")
-cmdLine_fmt = ifdef($fmt, "text")
-
-X = read($X);
-W = read($model);
-
-dimensions = as.scalar(W[nrow(W),1])
-if(dimensions != ncol(X))
- stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
-
-intercept = as.scalar(W[nrow(W)-1,1])
-W = W[1:(nrow(W)-2),]
-
-N = nrow(X);
-num_classes = ncol(W)
-m=ncol(X);
-
-b = matrix(0, rows=1, cols=num_classes)
-if (intercept == 1)
- b = W[m+1,]
-
-ones = matrix(1, rows=N, cols=1)
-scores = X %*% W[1:m,] + ones %*% b;
-
-if(cmdLine_scores != " ")
- write(scores, cmdLine_scores, format=cmdLine_fmt);
-
-if(cmdLine_Y != " "){
- y = read(cmdLine_Y);
-
- if(min(y) < 1)
- stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
-
- pred = rowIndexMax(scores);
- correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100;
-
- acc_str = "Accuracy (%): " + correct_percentage
- print(acc_str)
- if(cmdLine_accuracy != " ")
- write(acc_str, cmdLine_accuracy)
-
- num_classes_ground_truth = max(y)
- if(num_classes < num_classes_ground_truth)
- num_classes = num_classes_ground_truth
-
- if(cmdLine_confusion != " "){
- confusion_mat = table(pred, y, num_classes, num_classes)
- write(confusion_mat, cmdLine_confusion, format="csv")
- }
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script can be used to compute label predictions
+# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set
+#
+# Given ground truth labels, the script will compute an
+# accuracy (%) for the predictions
+#
+# Example Usage:
+# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text"
+#
+
+cmdLine_Y = ifdef($Y, " ")
+cmdLine_confusion = ifdef($confusion, " ")
+cmdLine_accuracy = ifdef($accuracy, " ")
+cmdLine_scores = ifdef($scores, " ")
+cmdLine_fmt = ifdef($fmt, "text")
+
+X = read($X);
+W = read($model);
+
+dimensions = as.scalar(W[nrow(W),1])
+if(dimensions != ncol(X))
+ stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions")
+
+intercept = as.scalar(W[nrow(W)-1,1])
+W = W[1:(nrow(W)-2),]
+
+N = nrow(X);
+num_classes = ncol(W)
+m=ncol(X);
+
+b = matrix(0, rows=1, cols=num_classes)
+if (intercept == 1)
+ b = W[m+1,]
+
+ones = matrix(1, rows=N, cols=1)
+scores = X %*% W[1:m,] + ones %*% b;
+
+if(cmdLine_scores != " ")
+ write(scores, cmdLine_scores, format=cmdLine_fmt);
+
+if(cmdLine_Y != " "){
+ y = read(cmdLine_Y);
+
+ if(min(y) < 1)
+ stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+
+ pred = rowIndexMax(scores);
+ correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100;
+
+ acc_str = "Accuracy (%): " + correct_percentage
+ print(acc_str)
+ if(cmdLine_accuracy != " ")
+ write(acc_str, cmdLine_accuracy)
+
+ num_classes_ground_truth = max(y)
+ if(num_classes < num_classes_ground_truth)
+ num_classes = num_classes_ground_truth
+
+ if(cmdLine_confusion != " "){
+ confusion_mat = table(pred, y, num_classes, num_classes)
+ write(confusion_mat, cmdLine_confusion, format="csv")
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml
index c570872..560d46f 100644
--- a/scripts/algorithms/m-svm.dml
+++ b/scripts/algorithms/m-svm.dml
@@ -1,174 +1,174 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Implements multiclass SVM with squared slack variables,
-# learns one-against-the-rest binary-class classifiers
-#
-# Example Usage:
-# Assume SVM_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100
-#
-# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
-#
-
-cmdLine_fmt = ifdef($fmt, "text")
-cmdLine_icpt = ifdef($icpt, 0)
-cmdLine_tol = ifdef($tol, 0.001)
-cmdLine_reg = ifdef($reg, 1.0)
-cmdLine_maxiter = ifdef($maxiter, 100)
-
-print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
-
-X = read($X)
-
-if(nrow(X) < 2)
- stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
-
-dimensions = ncol(X)
-
-Y = read($Y)
-
-if(nrow(X) != nrow(Y))
- stop("Stopping due to invalid argument: Numbers of rows in X and Y must match")
-
-intercept = cmdLine_icpt
-if(intercept != 0 & intercept != 1)
- stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
-
-min_y = min(Y)
-if(min_y < 1)
- stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
-num_classes = max(Y)
-if(num_classes == 1)
- stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")
-mod1 = Y %% 1
-mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "==")))
-if(mod1_should_be_nrow != nrow(Y))
- stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
-
-epsilon = cmdLine_tol
-if(epsilon < 0)
- stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
-
-lambda = cmdLine_reg
-if(lambda < 0)
- stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
-
-max_iterations = cmdLine_maxiter
-if(max_iterations < 1)
- stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
-
-num_samples = nrow(X)
-num_features = ncol(X)
-
-if (intercept == 1) {
- ones = matrix(1, rows=num_samples, cols=1);
- X = append(X, ones);
-}
-
-num_rows_in_w = num_features
-if(intercept == 1){
- num_rows_in_w = num_rows_in_w + 1
-}
-w = matrix(0, rows=num_rows_in_w, cols=num_classes)
-
-debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
-parfor(iter_class in 1:num_classes){
- Y_local = 2 * ppred(Y, iter_class, "==") - 1
- w_class = matrix(0, rows=num_features, cols=1)
- if (intercept == 1) {
- zero_matrix = matrix(0, rows=1, cols=1);
- w_class = t(append(t(w_class), zero_matrix));
- }
-
- g_old = t(X) %*% Y_local
- s = g_old
-
- Xw = matrix(0, rows=nrow(X), cols=1)
- iter = 0
- continue = 1
- while(continue == 1) {
- # minimizing primal obj along direction s
- step_sz = 0
- Xd = X %*% s
- wd = lambda * sum(w_class * s)
- dd = lambda * sum(s * s)
- continue1 = 1
- while(continue1 == 1){
- tmp_Xw = Xw + step_sz*Xd
- out = 1 - Y_local * (tmp_Xw)
- sv = ppred(out, 0, ">")
- out = out * sv
- g = wd + step_sz*dd - sum(out * Y_local * Xd)
- h = dd + sum(Xd * sv * Xd)
- step_sz = step_sz - g/h
- if (g*g/h < 0.0000000001){
- continue1 = 0
- }
- }
-
- #update weights
- w_class = w_class + step_sz*s
- Xw = Xw + step_sz*Xd
-
- out = 1 - Y_local * Xw
- sv = ppred(out, 0, ">")
- out = sv * out
- obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
- g_new = t(X) %*% (out * Y_local) - lambda * w_class
-
- tmp = sum(s * g_old)
-
- train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100
- print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
- debug_mat[iter+1,iter_class] = obj
-
- if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
- continue = 0
- }
-
- #non-linear CG step
- be = sum(g_new * g_new)/sum(g_old * g_old)
- s = be * s + g_new
- g_old = g_new
-
- iter = iter + 1
- }
-
- w[,iter_class] = w_class
-}
-
-extra_model_params = matrix(0, rows=2, cols=ncol(w))
-extra_model_params[1, 1] = intercept
-extra_model_params[2, 1] = dimensions
-w = t(append(t(w), t(extra_model_params)))
-write(w, $model, format=cmdLine_fmt)
-
-debug_str = "# Class, Iter, Obj"
-for(iter_class in 1:ncol(debug_mat)){
- for(iter in 1:nrow(debug_mat)){
- obj = castAsScalar(debug_mat[iter, iter_class])
- if(obj != -1)
- debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
- }
-}
-write(debug_str, $Log)
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Implements multiclass SVM with squared slack variables,
+# learns one-against-the-rest binary-class classifiers
+#
+# Example Usage:
+# Assume SVM_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100
+#
+# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text"
+#
+
+cmdLine_fmt = ifdef($fmt, "text")
+cmdLine_icpt = ifdef($icpt, 0)
+cmdLine_tol = ifdef($tol, 0.001)
+cmdLine_reg = ifdef($reg, 1.0)
+cmdLine_maxiter = ifdef($maxiter, 100)
+
+print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter)
+
+X = read($X)
+
+if(nrow(X) < 2)
+ stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
+
+dimensions = ncol(X)
+
+Y = read($Y)
+
+if(nrow(X) != nrow(Y))
+ stop("Stopping due to invalid argument: Numbers of rows in X and Y must match")
+
+intercept = cmdLine_icpt
+if(intercept != 0 & intercept != 1)
+ stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1")
+
+min_y = min(Y)
+if(min_y < 1)
+ stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
+num_classes = max(Y)
+if(num_classes == 1)
+ stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")
+mod1 = Y %% 1
+mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "==")))
+if(mod1_should_be_nrow != nrow(Y))
+ stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
+
+epsilon = cmdLine_tol
+if(epsilon < 0)
+ stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative")
+
+lambda = cmdLine_reg
+if(lambda < 0)
+ stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative")
+
+max_iterations = cmdLine_maxiter
+if(max_iterations < 1)
+ stop("Stopping due to invalid argument: Maximum iterations should be a positive integer")
+
+num_samples = nrow(X)
+num_features = ncol(X)
+
+if (intercept == 1) {
+ ones = matrix(1, rows=num_samples, cols=1);
+ X = append(X, ones);
+}
+
+num_rows_in_w = num_features
+if(intercept == 1){
+ num_rows_in_w = num_rows_in_w + 1
+}
+w = matrix(0, rows=num_rows_in_w, cols=num_classes)
+
+debug_mat = matrix(-1, rows=max_iterations, cols=num_classes)
+parfor(iter_class in 1:num_classes){
+ Y_local = 2 * ppred(Y, iter_class, "==") - 1
+ w_class = matrix(0, rows=num_features, cols=1)
+ if (intercept == 1) {
+ zero_matrix = matrix(0, rows=1, cols=1);
+ w_class = t(append(t(w_class), zero_matrix));
+ }
+
+ g_old = t(X) %*% Y_local
+ s = g_old
+
+ Xw = matrix(0, rows=nrow(X), cols=1)
+ iter = 0
+ continue = 1
+ while(continue == 1) {
+ # minimizing primal obj along direction s
+ step_sz = 0
+ Xd = X %*% s
+ wd = lambda * sum(w_class * s)
+ dd = lambda * sum(s * s)
+ continue1 = 1
+ while(continue1 == 1){
+ tmp_Xw = Xw + step_sz*Xd
+ out = 1 - Y_local * (tmp_Xw)
+ sv = ppred(out, 0, ">")
+ out = out * sv
+ g = wd + step_sz*dd - sum(out * Y_local * Xd)
+ h = dd + sum(Xd * sv * Xd)
+ step_sz = step_sz - g/h
+ if (g*g/h < 0.0000000001){
+ continue1 = 0
+ }
+ }
+
+ #update weights
+ w_class = w_class + step_sz*s
+ Xw = Xw + step_sz*Xd
+
+ out = 1 - Y_local * Xw
+ sv = ppred(out, 0, ">")
+ out = sv * out
+ obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
+ g_new = t(X) %*% (out * Y_local) - lambda * w_class
+
+ tmp = sum(s * g_old)
+
+ train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100
+ print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
+ debug_mat[iter+1,iter_class] = obj
+
+ if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
+ continue = 0
+ }
+
+ #non-linear CG step
+ be = sum(g_new * g_new)/sum(g_old * g_old)
+ s = be * s + g_new
+ g_old = g_new
+
+ iter = iter + 1
+ }
+
+ w[,iter_class] = w_class
+}
+
+extra_model_params = matrix(0, rows=2, cols=ncol(w))
+extra_model_params[1, 1] = intercept
+extra_model_params[2, 1] = dimensions
+w = t(append(t(w), t(extra_model_params)))
+write(w, $model, format=cmdLine_fmt)
+
+debug_str = "# Class, Iter, Obj"
+for(iter_class in 1:ncol(debug_mat)){
+ for(iter in 1:nrow(debug_mat)){
+ obj = castAsScalar(debug_mat[iter, iter_class])
+ if(obj != -1)
+ debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
+ }
+}
+write(debug_str, $Log)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/random-forest-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/random-forest-predict.dml b/scripts/algorithms/random-forest-predict.dml
index 2d99670..7bc6cd6 100644
--- a/scripts/algorithms/random-forest-predict.dml
+++ b/scripts/algorithms/random-forest-predict.dml
@@ -1,193 +1,193 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET
-# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET.
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error;
-# note that X needs to be both recoded and dummy coded
-# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
-# R String " " Location to read the matrix R which for each feature in X contains the following information
-# - R[,1]: column ids
-# - R[,2]: start indices
-# - R[,3]: end indices
-# If R is not provided by default all variables are assumed to be scale
-# M String --- Location to read matrix M containing the learned tree i the following format
-# - M[1,j]: id of node j (in a complete binary tree)
-# - M[2,j]: tree id
-# - M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
-# otherwise the label that leaf node j is supposed to predict
-# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
-# stored in rows 7,8,... if j is categorical
-# If j is a leaf node: number of misclassified samples reaching at node j
-# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j]
-# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,...
-# depict the value subset chosen for j
-# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest
-# P String --- Location to store the label predictions for X
-# A String " " Location to store the test accuracy (%) for the prediction if requested
-# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model
-# CM String " " Location to store the confusion matrix if requested
-# fmt String "text" The output format of the output, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- Matrix Y containing the predicted labels for X
-# 2- Test accuracy if requested
-# 3- Confusion matrix C if requested
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
-# A=OUTPUT_DIR/accurcay CM=OUTPUT_DIR/confusion fmt=csv
-
-fileX = $X;
-fileM = $M;
-fileP = $P;
-fileY = ifdef ($Y, " ");
-fileR = ifdef ($R, " ");
-fileC = ifdef ($C, " ");
-fileOOB = ifdef ($OOB, " ");
-fileCM = ifdef ($CM, " ");
-fileA = ifdef ($A, " ");
-fmtO = ifdef ($fmt, "text");
-X = read (fileX);
-M = read (fileM);
-
-num_records = nrow (X);
-Y_predicted = matrix (0, rows = num_records, cols = 1);
-num_trees = max (M[2,]);
-num_labels = max (M[5,]);
-num_nodes_per_tree = aggregate (target = t (M[2,]), groups = t (M[2,]), fn = "count");
-num_nodes_per_tree_cum = cumsum (num_nodes_per_tree);
-
-R_cat = matrix (0, rows = 1, cols = 1);
-R_scale = matrix (0, rows = 1, cols = 1);
-
-if (fileR != " ") {
- R = read (fileR);
- dummy_coded = ppred (R[,2], R[,3], "!=");
- R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
- R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
-} else { # only scale features available
- R_scale = seq (1, ncol (X));
-}
-
-if (fileC != " ") {
- C = read (fileC);
- label_counts_oob = matrix (0, rows = num_records, cols = num_labels);
-}
-
-label_counts = matrix (0, rows = num_records, cols = num_labels);
-parfor (i in 1:num_records, check = 0) {
- cur_sample = X[i,];
- cur_node_pos = 1;
- # cur_node = 1;
- cur_tree = 1;
- start_ind = 1;
- labels_found = FALSE;
- while (!labels_found) {
-
- cur_feature = as.scalar (M[4,cur_node_pos]);
- type_label = as.scalar (M[5,cur_node_pos]);
- if (cur_feature == 0) { # leaf found
- label_counts[i,type_label] = label_counts[i,type_label] + 1;
- if (fileC != " ") {
- if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1;
- }
- if (cur_tree < num_trees) {
- cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1;
- } else if (cur_tree == num_trees) {
- labels_found = TRUE;
- }
- cur_tree = cur_tree + 1;
- } else {
- # determine type: 1 for scale, 2 for categorical
- if (type_label == 1) { # scale feature
- cur_start_ind = as.scalar (R_scale[cur_feature,]);
- cur_value = as.scalar (cur_sample[,cur_start_ind]);
- cur_split = as.scalar (M[7,cur_node_pos]);
- if (cur_value < cur_split) { # go to left branch
- cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]);
- # cur_node = as.scalar (cur_M[1,cur_node_pos]);
- } else { # go to right branch
- cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1;
- # cur_node = as.scalar (cur_M[1,cur_node_pos]);
- }
- } else if (type_label == 2) { # categorical feature
- cur_start_ind = as.scalar (R_cat[cur_feature,1]);
- cur_end_ind = as.scalar (R_cat[cur_feature,2]);
- cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind]));
- cur_offset = as.scalar (M[6,cur_node_pos]);
- value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "=="));
- if (value_found >= 1) { # go to left branch
- cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]);
- # cur_node = as.scalar (cur_M[1,cur_node_pos]);
- } else { # go to right branch
- cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1;
- # cur_node = as.scalar (cur_M[1,cur_node_pos]);
- }
-
- }
-}}}
-
-Y_predicted = rowIndexMax (label_counts);
-write (Y_predicted, fileP, format = fmtO);
-
-if (fileY != " ") {
- Y_dummy = read (fileY);
- num_classes = ncol (Y_dummy);
- Y = rowSums (Y_dummy * t (seq (1, num_classes)));
- result = ppred (Y, Y_predicted, "==");
- result = sum (result);
- accuracy = result / num_records * 100;
- acc_str = "Accuracy (%): " + accuracy;
- if (fileA != " ") {
- write (acc_str, fileA, format = fmtO);
- } else {
- print (acc_str);
- }
- if (fileC != " ") {
- oob_ind = ppred (rowSums (label_counts_oob), 0, ">")
- label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows");
- num_oob = nrow (label_counts_oob);
- Y_predicted_oob = rowIndexMax (label_counts_oob);
- Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows");
- result = ppred (Y_oob, Y_predicted_oob, "==");
- oob_error = (1 - (sum (result) / num_oob)) * 100;
- oob_str = "Out-Of-Bag error (%): " + oob_error;
- if (fileOOB != " ") {
- write (oob_str, fileOOB, format = fmtO);
- } else {
- print (oob_str);
- }
- }
- if (fileCM != " ") {
- confusion_mat = table(Y_predicted, Y, num_classes, num_classes)
- write(confusion_mat, fileCM, format = fmtO)
- }
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET
+# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error;
+# note that X needs to be both recoded and dummy coded
+# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded
+# R String " " Location to read the matrix R which for each feature in X contains the following information
+# - R[,1]: column ids
+# - R[,2]: start indices
+# - R[,3]: end indices
+# If R is not provided by default all variables are assumed to be scale
+# M String --- Location to read matrix M containing the learned tree i the following format
+# - M[1,j]: id of node j (in a complete binary tree)
+# - M[2,j]: tree id
+# - M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
+# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
+# otherwise the label that leaf node j is supposed to predict
+# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
+# stored in rows 7,8,... if j is categorical
+# If j is a leaf node: number of misclassified samples reaching at node j
+# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j]
+# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,...
+# depict the value subset chosen for j
+# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest
+# P String --- Location to store the label predictions for X
+# A String " " Location to store the test accuracy (%) for the prediction if requested
+# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model
+# CM String " " Location to store the confusion matrix if requested
+# fmt String "text" The output format of the output, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- Matrix Y containing the predicted labels for X
+# 2- Test accuracy if requested
+# 3- Confusion matrix C if requested
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions
+# A=OUTPUT_DIR/accurcay CM=OUTPUT_DIR/confusion fmt=csv
+
+fileX = $X;
+fileM = $M;
+fileP = $P;
+fileY = ifdef ($Y, " ");
+fileR = ifdef ($R, " ");
+fileC = ifdef ($C, " ");
+fileOOB = ifdef ($OOB, " ");
+fileCM = ifdef ($CM, " ");
+fileA = ifdef ($A, " ");
+fmtO = ifdef ($fmt, "text");
+X = read (fileX);
+M = read (fileM);
+
+num_records = nrow (X);
+Y_predicted = matrix (0, rows = num_records, cols = 1);
+num_trees = max (M[2,]);
+num_labels = max (M[5,]);
+num_nodes_per_tree = aggregate (target = t (M[2,]), groups = t (M[2,]), fn = "count");
+num_nodes_per_tree_cum = cumsum (num_nodes_per_tree);
+
+R_cat = matrix (0, rows = 1, cols = 1);
+R_scale = matrix (0, rows = 1, cols = 1);
+
+if (fileR != " ") {
+ R = read (fileR);
+ dummy_coded = ppred (R[,2], R[,3], "!=");
+ R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows");
+ R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
+} else { # only scale features available
+ R_scale = seq (1, ncol (X));
+}
+
+if (fileC != " ") {
+ C = read (fileC);
+ label_counts_oob = matrix (0, rows = num_records, cols = num_labels);
+}
+
+label_counts = matrix (0, rows = num_records, cols = num_labels);
+parfor (i in 1:num_records, check = 0) {
+ cur_sample = X[i,];
+ cur_node_pos = 1;
+ # cur_node = 1;
+ cur_tree = 1;
+ start_ind = 1;
+ labels_found = FALSE;
+ while (!labels_found) {
+
+ cur_feature = as.scalar (M[4,cur_node_pos]);
+ type_label = as.scalar (M[5,cur_node_pos]);
+ if (cur_feature == 0) { # leaf found
+ label_counts[i,type_label] = label_counts[i,type_label] + 1;
+ if (fileC != " ") {
+ if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1;
+ }
+ if (cur_tree < num_trees) {
+ cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1;
+ } else if (cur_tree == num_trees) {
+ labels_found = TRUE;
+ }
+ cur_tree = cur_tree + 1;
+ } else {
+ # determine type: 1 for scale, 2 for categorical
+ if (type_label == 1) { # scale feature
+ cur_start_ind = as.scalar (R_scale[cur_feature,]);
+ cur_value = as.scalar (cur_sample[,cur_start_ind]);
+ cur_split = as.scalar (M[7,cur_node_pos]);
+ if (cur_value < cur_split) { # go to left branch
+ cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]);
+ # cur_node = as.scalar (cur_M[1,cur_node_pos]);
+ } else { # go to right branch
+ cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1;
+ # cur_node = as.scalar (cur_M[1,cur_node_pos]);
+ }
+ } else if (type_label == 2) { # categorical feature
+ cur_start_ind = as.scalar (R_cat[cur_feature,1]);
+ cur_end_ind = as.scalar (R_cat[cur_feature,2]);
+ cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind]));
+ cur_offset = as.scalar (M[6,cur_node_pos]);
+ value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "=="));
+ if (value_found >= 1) { # go to left branch
+ cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]);
+ # cur_node = as.scalar (cur_M[1,cur_node_pos]);
+ } else { # go to right branch
+ cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1;
+ # cur_node = as.scalar (cur_M[1,cur_node_pos]);
+ }
+
+ }
+}}}
+
+Y_predicted = rowIndexMax (label_counts);
+write (Y_predicted, fileP, format = fmtO);
+
+if (fileY != " ") {
+ Y_dummy = read (fileY);
+ num_classes = ncol (Y_dummy);
+ Y = rowSums (Y_dummy * t (seq (1, num_classes)));
+ result = ppred (Y, Y_predicted, "==");
+ result = sum (result);
+ accuracy = result / num_records * 100;
+ acc_str = "Accuracy (%): " + accuracy;
+ if (fileA != " ") {
+ write (acc_str, fileA, format = fmtO);
+ } else {
+ print (acc_str);
+ }
+ if (fileC != " ") {
+ oob_ind = ppred (rowSums (label_counts_oob), 0, ">")
+ label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows");
+ num_oob = nrow (label_counts_oob);
+ Y_predicted_oob = rowIndexMax (label_counts_oob);
+ Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows");
+ result = ppred (Y_oob, Y_predicted_oob, "==");
+ oob_error = (1 - (sum (result) / num_oob)) * 100;
+ oob_str = "Out-Of-Bag error (%): " + oob_error;
+ if (fileOOB != " ") {
+ write (oob_str, fileOOB, format = fmtO);
+ } else {
+ print (oob_str);
+ }
+ }
+ if (fileCM != " ") {
+ confusion_mat = table(Y_predicted, Y, num_classes, num_classes)
+ write(confusion_mat, fileCM, format = fmtO)
+ }
+}