You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2021/12/28 21:31:36 UTC
[systemds] branch main updated: [DOCS] Builtin function headers
This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new fb6be02 [DOCS] Builtin function headers
fb6be02 is described below
commit fb6be02f66832f46df1d641d36c701322845c4af
Author: Atefeh Asayesh <at...@gmail.com>
AuthorDate: Tue Dec 28 21:28:50 2021 +0100
[DOCS] Builtin function headers
This commit adds documentation headers in a consistent format
for the builtin functions.
Closes #1492
---
scripts/builtin/abstain.dml | 22 +++-
scripts/builtin/als.dml | 50 +++++-----
scripts/builtin/alsCG.dml | 58 ++++++-----
scripts/builtin/alsDS.dml | 39 ++++----
scripts/builtin/alsPredict.dml | 17 ++--
scripts/builtin/alsTopkPredict.dml | 29 +++---
scripts/builtin/applyAndEvaluate.dml | 28 ++++++
scripts/builtin/arima.dml | 45 +++++----
scripts/builtin/autoencoder_2layer.dml | 76 +++++++-------
scripts/builtin/bandit.dml | 32 ++++++
scripts/builtin/bivar.dml | 22 ++--
scripts/builtin/components.dml | 23 +++++
scripts/builtin/confusionMatrix.dml | 34 ++++---
scripts/builtin/cor.dml | 18 +++-
scripts/builtin/correctTypos.dml | 44 ++++----
scripts/builtin/cox.dml | 68 +++++++------
scripts/builtin/cspline.dml | 25 +++--
scripts/builtin/csplineCG.dml | 30 +++---
scripts/builtin/csplineDS.dml | 30 +++---
scripts/builtin/cvlm.dml | 27 ++++-
scripts/builtin/dbscan.dml | 16 ++-
scripts/builtin/decisionTree.dml | 50 ++++++----
scripts/builtin/deepWalk.dml | 35 ++++---
scripts/builtin/denialConstraints.dml | 114 +++++++++++----------
scripts/builtin/discoverFD.dml | 31 +++---
scripts/builtin/dist.dml | 15 +++
scripts/builtin/dmv.dml | 23 ++++-
scripts/builtin/ema.dml | 25 ++++-
scripts/builtin/executePipeline.dml | 34 +++++++
scripts/builtin/ffPredict.dml | 27 +++--
scripts/builtin/ffTrain.dml | 25 +++--
scripts/builtin/fixInvalidLengths.dml | 21 +++-
scripts/builtin/frameSort.dml | 30 +++---
scripts/builtin/garch.dml | 54 +++++-----
scripts/builtin/gaussianClassifier.dml | 38 ++++---
scripts/builtin/getAccuracy.dml | 34 +++----
scripts/builtin/glm.dml | 87 ++++++++--------
scripts/builtin/glmPredict.dml | 44 ++++----
scripts/builtin/gmm.dml | 61 ++++++------
scripts/builtin/gmmPredict.dml | 40 ++++----
scripts/builtin/gnmf.dml | 27 ++++-
scripts/builtin/gridSearch.dml | 59 ++++++-----
scripts/builtin/hospitalResidencyMatch.dml | 43 ++++----
scripts/builtin/hyperband.dml | 37 ++++++-
scripts/builtin/img_brightness.dml | 19 ++++
scripts/builtin/img_crop.dml | 20 ++++
scripts/builtin/img_cutout.dml | 34 ++++---
scripts/builtin/img_invert.dml | 25 +++--
scripts/builtin/img_mirror.dml | 17 ++++
scripts/builtin/img_posterize.dml | 28 +++---
scripts/builtin/img_rotate.dml | 28 +++---
scripts/builtin/img_sample_pairing.dml | 30 +++---
scripts/builtin/img_shear.dml | 30 +++---
scripts/builtin/img_transform.dml | 35 ++++---
scripts/builtin/img_translate.dml | 37 +++----
scripts/builtin/imputeByFD.dml | 35 ++++---
scripts/builtin/imputeByMean.dml | 34 +++----
scripts/builtin/imputeByMedian.dml | 36 +++----
scripts/builtin/imputeByMode.dml | 31 +++---
scripts/builtin/intersect.dml | 26 ++---
scripts/builtin/km.dml | 122 ++++++++++++-----------
scripts/builtin/kmeans.dml | 40 ++++----
scripts/builtin/kmeansPredict.dml | 26 ++---
scripts/builtin/knn.dml | 68 +++++++------
scripts/builtin/knnbf.dml | 27 +++--
scripts/builtin/l2svm.dml | 44 ++++----
scripts/builtin/l2svmPredict.dml | 32 +++---
scripts/builtin/lasso.dml | 31 +++---
scripts/builtin/lenetPredict.dml | 16 ++-
scripts/builtin/lenetTrain.dml | 21 ++--
scripts/builtin/lm.dml | 41 ++++----
scripts/builtin/lmCG.dml | 26 ++++-
scripts/builtin/lmDS.dml | 26 ++++-
scripts/builtin/lmPredict.dml | 21 ++++
scripts/builtin/logSumExp.dml | 37 ++++---
scripts/builtin/matrixProfile.dml | 41 ++++----
scripts/builtin/mdedup.dml | 33 +++---
scripts/builtin/mice.dml | 41 ++++----
scripts/builtin/msvm.dml | 41 ++++----
scripts/builtin/msvmPredict.dml | 28 +++---
scripts/builtin/multiLogReg.dml | 39 ++++----
scripts/builtin/multiLogRegPredict.dml | 32 +++---
scripts/builtin/na_locf.dml | 31 +++---
scripts/builtin/naiveBayes.dml | 20 ++++
scripts/builtin/naiveBayesPredict.dml | 19 ++++
scripts/builtin/normalize.dml | 25 +++--
scripts/builtin/normalizeApply.dml | 14 ++-
scripts/builtin/outlier.dml | 19 ++++
scripts/builtin/outlierByArima.dml | 54 +++++-----
scripts/builtin/outlierByIQR.dml | 44 ++++----
scripts/builtin/outlierBySd.dml | 38 ++++---
scripts/builtin/pca.dml | 37 +++----
scripts/builtin/pcaInverse.dml | 26 +++--
scripts/builtin/pcaTransform.dml | 26 +++--
scripts/builtin/pnmf.dml | 23 ++++-
scripts/builtin/ppca.dml | 38 +++----
scripts/builtin/randomForest.dml | 86 ++++++++--------
scripts/builtin/scale.dml | 31 +++---
scripts/builtin/scaleApply.dml | 27 +++--
scripts/builtin/selectByVarThresh.dml | 17 ++++
scripts/builtin/setdiff.dml | 16 +--
scripts/builtin/sherlock.dml | 92 +++++++++--------
scripts/builtin/sherlockNet.dml | 60 ++++++-----
scripts/builtin/sherlockPredict.dml | 91 +++++++++--------
scripts/builtin/shortestPath.dml | 48 +++++----
scripts/builtin/sigmoid.dml | 19 +++-
scripts/builtin/slicefinder.dml | 49 +++++----
scripts/builtin/smote.dml | 34 +++----
scripts/builtin/softmax.dml | 17 ++++
scripts/builtin/split.dml | 37 ++++---
scripts/builtin/splitBalanced.dml | 38 ++++---
scripts/builtin/stableMarriage.dml | 101 ++++++++++---------
scripts/builtin/statsNA.dml | 51 ++++++----
scripts/builtin/steplm.dml | 62 ++++++------
scripts/builtin/stratstats.dml | 120 +++++++++++-----------
scripts/builtin/symmetricDifference.dml | 17 ++--
scripts/builtin/tSNE.dml | 43 ++++----
scripts/builtin/toOneHot.dml | 33 +++---
scripts/builtin/tomeklink.dml | 32 +++---
scripts/builtin/topk_cleaning.dml | 33 ++++++
scripts/builtin/underSampling.dml | 19 +++-
scripts/builtin/union.dml | 13 ++-
scripts/builtin/unique.dml | 15 +--
scripts/builtin/univar.dml | 16 ++-
scripts/builtin/vectorToCsv.dml | 33 +++---
scripts/builtin/winsorize.dml | 20 ++++
scripts/builtin/xdummy1.dml | 17 ++++
scripts/builtin/xdummy2.dml | 18 ++++
scripts/builtin/xgboost.dml | 55 ++++++----
scripts/builtin/xgboostPredictClassification.dml | 39 ++++----
scripts/builtin/xgboostPredictRegression.dml | 35 ++++---
131 files changed, 2873 insertions(+), 1960 deletions(-)
diff --git a/scripts/builtin/abstain.dml b/scripts/builtin/abstain.dml
index 8d5ab03..3a8ea4e 100644
--- a/scripts/builtin/abstain.dml
+++ b/scripts/builtin/abstain.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,6 +19,24 @@
#
#-------------------------------------------------------------
+# This function calls the multiLogReg function, which solves Multinomial Logistic Regression using the Trust Region method
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors
+# Y Matrix[Double] --- Matrix with category labels
+# threshold Double 0.0 ---
+# verbose Boolean FALSE flag specifying if logging information should be printed
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# abstain Matrix[Double] ---
+# ----------------------------------------------------------------------------------------------------------------------
m_abstain = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Boolean verbose = FALSE)
return (Matrix[Double] abstain)
diff --git a/scripts/builtin/als.dml b/scripts/builtin/als.dml
index f332e62..8048f2f 100644
--- a/scripts/builtin/als.dml
+++ b/scripts/builtin/als.dml
@@ -19,35 +19,39 @@
#
#-------------------------------------------------------------
-#
# This script computes an approximate factorization of a low-rank matrix X into two matrices U and V
# using different implementations of the Alternating-Least-Squares (ALS) algorithm.
# Matrices U and V are computed by minimizing a loss function (with regularization).
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the input matrix X to be factorized
-# rank Int 10 Rank of the factorization
-# reg String "L2" Regularization:
-# "L2" = L2 regularization;
-# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
-# + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2))
-# "wL2" = weighted L2 regularization
-# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
-# + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros)
-# + sum (V ^ 2 * col_nonzeros))
-# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
-# maxi Int 50 Maximum number of iterations
-# check Boolean TRUE Check for convergence after every iteration, i.e., updating U and V once
-# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
-# if the decrease in loss in any two consecutive iterations falls below this threshold;
-# if check is FALSE thr is ignored
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input matrix X to be factorized
+# rank Integer 10 Rank of the factorization
+# reg String "L2" Regularization:
+# "L2" = L2 regularization;
+# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
+# + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2))
+# "wL2" = weighted L2 regularization
+# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
+# + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros)
+# + sum (V ^ 2 * col_nonzeros))
+# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
+# maxi Integer 50 Maximum number of iterations
+# check Boolean TRUE Check for convergence after every iteration, i.e., updating U and V once
+# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
+# if the decrease in loss in any two consecutive iterations falls below this threshold;
+# if check is FALSE thr is ignored
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# 1- An m x r matrix U, where r is the factorization rank
-# 2- An r x n matrix V
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# U Matrix An m x r matrix where r is the factorization rank
+# V Matrix An r x n matrix where r is the factorization rank
+# ----------------------------------------------------------------------------------------------------------------------
m_als = function(Matrix[Double] X, Integer rank = 10, String reg = "L2", Double lambda = 0.000001,
Integer maxi = 50, Boolean check = TRUE, Double thr = 0.0001, Boolean verbose = TRUE)
diff --git a/scripts/builtin/alsCG.dml b/scripts/builtin/alsCG.dml
index b713b93..d001c41 100644
--- a/scripts/builtin/alsCG.dml
+++ b/scripts/builtin/alsCG.dml
@@ -19,38 +19,42 @@
#
#-------------------------------------------------------------
-#
# This script computes an approximate factorization of a low-rank matrix X into two matrices U and V
# using the Alternating-Least-Squares (ALS) algorithm with conjugate gradient.
# Matrices U and V are computed by minimizing a loss function (with regularization).
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the input matrix X to be factorized
-# rank Int 10 Rank of the factorization
-# reg String "L2" Regularization:
-# "L2" = L2 regularization;
-# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
-# + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2))
-# "wL2" = weighted L2 regularization
-# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
-# + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros)
-# + sum (V ^ 2 * col_nonzeros))
-# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
-# maxi Int 50 Maximum number of iterations
-# check Boolean TRUE Check for convergence after every iteration, i.e., updating U and V once
-# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
-# if the decrease in loss in any two consecutive iterations falls below this threshold;
-# if check is FALSE thr is ignored
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- An m x r matrix U, where r is the factorization rank
-# 2- An r x n matrix V
-
-
-m_alsCG = function(Matrix[Double] X, Integer rank = 10, String reg = "L2", Double lambda = 0.000001, Integer maxi = 50, Boolean check = TRUE, Double thr = 0.0001, Boolean verbose = TRUE)
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input matrix X to be factorized
+# rank Integer 10 Rank of the factorization
+# reg String "L2" Regularization:
+# "L2" = L2 regularization;
+# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
+# + 0.5 * lambda * (sum (U ^ 2) + sum (V ^ 2))
+# "wL2" = weighted L2 regularization
+# f (U, V) = 0.5 * sum (W * (U %*% V - X) ^ 2)
+# + 0.5 * lambda * (sum (U ^ 2 * row_nonzeros)
+# + sum (V ^ 2 * col_nonzeros))
+# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
+# maxi Integer 50 Maximum number of iterations
+# check Boolean TRUE Check for convergence after every iteration, i.e., updating U and V once
+# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
+# if the decrease in loss in any two consecutive iterations falls below this threshold;
+# if check is FALSE thr is ignored
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# U Matrix[Double] An m x r matrix where r is the factorization rank
+# V Matrix[Double] An r x n matrix where r is the factorization rank
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_alsCG = function(Matrix[Double] X, Integer rank = 10, String reg = "L2", Double lambda = 0.000001, Integer maxi = 50,
+ Boolean check = TRUE, Double thr = 0.0001, Boolean verbose = TRUE)
return (Matrix[Double] U, Matrix[Double] V)
{
r = rank;
diff --git a/scripts/builtin/alsDS.dml b/scripts/builtin/alsDS.dml
index dfb88de..0b57978 100644
--- a/scripts/builtin/alsDS.dml
+++ b/scripts/builtin/alsDS.dml
@@ -19,31 +19,32 @@
#
#-------------------------------------------------------------
-#
-# Alternating-Least-Squares (ALS) algorithm using a direct solve method for
+# Alternating-Least-Squares (ALS) algorithm using a direct solve method for
# individual least squares problems (reg="L2"). This script computes an
# approximate factorization of a low-rank matrix V into two matrices L and R.
# Matrices L and R are computed by minimizing a loss function (with regularization).
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# V String --- Location to read the input matrix V to be factorized
-# L String --- Location to write the factor matrix L
-# R String --- Location to write the factor matrix R
-# rank Int 10 Rank of the factorization
-# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
-# maxi Int 50 Maximum number of iterations
-# check Boolean FALSE Check for convergence after every iteration, i.e., updating L and R once
-# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
-# if the decrease in loss in any two consecutive iterations falls below this threshold;
-# if check is FALSE thr is ignored
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- An m x r matrix L, where r is the factorization rank
-# 2- An r x n matrix R
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input matrix X to be factorized
+# rank Integer 10 Rank of the factorization
+# lambda Double 0.000001 Regularization parameter, no regularization if 0.0
+# maxi Integer 50 Maximum number of iterations
+# check Boolean FALSE Check for convergence after every iteration, i.e., updating L and R once
+# thr Double 0.0001 Assuming check is set to TRUE, the algorithm stops and convergence is declared
+# if the decrease in loss in any two consecutive iterations falls below this threshold;
+# if check is FALSE thr is ignored
+# ----------------------------------------------------------------------------------------------------------------------
#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# U Matrix[Double] An m x r matrix where r is the factorization rank
+# V Matrix[Double] An r x n matrix where r is the factorization rank
+# ----------------------------------------------------------------------------------------------------------------------
m_alsDS = function(Matrix[Double] X, Integer rank = 10, Double lambda = 0.000001,
Integer maxi = 50, Boolean check = FALSE, Double thr = 0.0001, Boolean verbose = TRUE)
diff --git a/scripts/builtin/alsPredict.dml b/scripts/builtin/alsPredict.dml
index 1cbd571..d6181f8 100644
--- a/scripts/builtin/alsPredict.dml
+++ b/scripts/builtin/alsPredict.dml
@@ -25,15 +25,20 @@
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# userIDs Matrix --- Column vector of user-ids (n x 1)
-# I Matrix --- Indicator matrix user-id x user-id to exclude from scoring
-# L Matrix --- The factor matrix L: user-id x feature-id
-# R Matrix --- The factor matrix R: feature-id x item-id
+# userIDs Matrix[Double] --- Column vector of user-ids (n x 1)
+# I Matrix[Double] --- Indicator matrix user-id x user-id to exclude from scoring
+# L Matrix[Double] --- The factor matrix L: user-id x feature-id
+# R Matrix[Double] --- The factor matrix R: feature-id x item-id
# ---------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# Y Matrix --- The output user-id/item-id/score
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ---------------------------------------------------------------------------------------------
+# Y Matrix[Double] The output user-id/item-id/score
+# ---------------------------------------------------------------------------------------------
m_alsPredict = function(Matrix[Double] userIDs, Matrix[Double] I, Matrix[Double] L, Matrix[Double] R)
return (Matrix[Double] Y)
diff --git a/scripts/builtin/alsTopkPredict.dml b/scripts/builtin/alsTopkPredict.dml
index de6c9ce..90d7cd8 100644
--- a/scripts/builtin/alsTopkPredict.dml
+++ b/scripts/builtin/alsTopkPredict.dml
@@ -24,19 +24,24 @@
# at least once and all items have been rates at least once.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# userIDs Matrix --- Column vector of user-ids (n x 1)
-# I Matrix --- Indicator matrix user-id x user-id to exclude from scoring
-# L Matrix --- The factor matrix L: user-id x feature-id
-# R Matrix --- The factor matrix R: feature-id x item-id
-# K Int 5 The number of top-K items
-# ---------------------------------------------------------------------------------------------
+# -----------------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# -----------------------------------------------------------------------------------------------------------------------------
+# userIDs Matrix[Double] --- Column vector of user-ids (n x 1)
+# I Matrix[Double] --- Indicator matrix user-id x user-id to exclude from scoring
+# L Matrix[Double] --- The factor matrix L: user-id x feature-id
+# R Matrix[Double] --- The factor matrix R: feature-id x item-id
+# K Integer 5 The number of top-K items
+#
+# ------------------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# TopIxs Matrix --- A matrix containing the top-K item-ids with highest predicted ratings
-# for the specified users (rows)
-# TopVals Matrix --- A matrix containing the top-K predicted ratings for the specified users (rows)
+# ------------------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ------------------------------------------------------------------------------------------------------------------------------
+# TopIxs Matrix[Double] A matrix containing the top-K item-ids with highest predicted ratings for the specified users (rows)
+# TopVals Matrix[Double] A matrix containing the top-K predicted ratings for the specified users (rows)
+# ------------------------------------------------------------------------------------------------------------------------------
m_alsTopkPredict = function(Matrix[Double] userIDs, Matrix[Double] I, Matrix[Double] L, Matrix[Double] R, Integer K = 5)
return (Matrix[Double] TopIxs, Matrix[Double] TopVals)
diff --git a/scripts/builtin/applyAndEvaluate.dml b/scripts/builtin/applyAndEvaluate.dml
index d32907f..646c718 100644
--- a/scripts/builtin/applyAndEvaluate.dml
+++ b/scripts/builtin/applyAndEvaluate.dml
@@ -19,6 +19,34 @@
#
#-------------------------------------------------------------
+# This script reads the dirty and clean data and applies the best pipeline to the dirty data.
+# It then classifies both datasets and checks whether the cleaned dataset performs the same as the original dataset
+# in terms of classification accuracy.
+
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# trainData Frame[Unknown] ---
+# testData Frame[Unknown] ---
+# metaData Frame[Unknown] as.frame("NULL")
+# lp Frame[Unknown] ---
+# pip Frame[Unknown] ---
+# hp Frame[Unknown] ---
+# evaluationFunc String ---
+# evalFunHp Matrix[Double] ---
+# isLastLabel Boolean TRUE
+# correctTypos Boolean FALSE
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# result Matrix[Double] ---
+# ----------------------------------------------------------------------------------------------------------------------
+
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/builtin/bandit.dml") as bandit;
s_applyAndEvaluate = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
diff --git a/scripts/builtin/arima.dml b/scripts/builtin/arima.dml
index de0ebec..e1408b6 100644
--- a/scripts/builtin/arima.dml
+++ b/scripts/builtin/arima.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,27 +22,28 @@
# Builtin function that implements ARIMA
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Double --- The input Matrix to apply Arima on.
-# max_func_invoc Int 1000 ?
-# p Int 0 non-seasonal AR order
-# d Int 0 non-seasonal differencing order
-# q Int 0 non-seasonal MA order
-# P Int 0 seasonal AR order
-# D Int 0 seasonal differencing order
-# Q Int 0 seasonal MA order
-# s Int 1 period in terms of number of time-steps
-# include_mean Boolean FALSE center to mean 0, and include in result
-# solver String jacobi solver, is either "cg" or "jacobi"
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input Matrix to apply Arima on.
+# max_func_invoc Int 1000
+# p Int 0 non-seasonal AR order
+# d Int 0 non-seasonal differencing order
+# q Int 0 non-seasonal MA order
+# P Int 0 seasonal AR order
+# D Int 0 seasonal differencing order
+# Q Int 0 seasonal MA order
+# s Int 1 period in terms of number of time-steps
+# include_mean Boolean FALSE center to mean 0, and include in result
+# solver String jacobi solver, is either "cg" or "jacobi"
+# ----------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# best_point String --- The calculated coefficients
-# ----------------------------------------------------------------------------
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# best_point Matrix[Double] The calculated coefficients
+# ----------------------------------------------------------------------------------------------------------------------
m_arima = function(Matrix[Double] X, Integer max_func_invoc=1000, Integer p=0,
Integer d=0, Integer q=0, Integer P=0, Integer D=0, Integer Q=0, Integer s=1,
diff --git a/scripts/builtin/autoencoder_2layer.dml b/scripts/builtin/autoencoder_2layer.dml
index 1f54604..3028f69 100644
--- a/scripts/builtin/autoencoder_2layer.dml
+++ b/scripts/builtin/autoencoder_2layer.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -26,46 +26,44 @@
# Also, it randomly reshuffles rows before training.
# Currently, tanh is set to be the activation function.
# By re-implementing 'func' DML-bodied function, one can change the activation.
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Filename where the input is stored
-# H1 Int --- Number of neurons in the 1st hidden layer
-# H2 Int --- Number of neurons in the 2nd hidden layer
-# EPOCH Int --- Number of epochs to train for
-# OBJ Boolean FALSE If TRUE, Computes objective function value (squared-loss)
-# at the end of each epoch. Note that, computing the full
-# objective can take a lot of time.
-# BATCH Int 256 Mini-batch size (training parameter)
-# STEP Double 1e-5 Initial step size (training parameter)
-# DECAY Double 0.95 Decays step size after each epoch (training parameter)
-# MOMENTUM Double 0.9 Momentum parameter (training parameter)
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input data matrix
+# num_hidden1 Integer --- Number of neurons in the 1st hidden layer
+# num_hidden2 Integer --- Number of neurons in the 2nd hidden layer
+# max_epochs Integer --- Number of epochs to train for
+# full_obj Boolean FALSE If TRUE, Computes objective function value (squared-loss)
+# at the end of each epoch. Note that, computing the full
+# objective can take a lot of time.
+# batch_size Integer 256 Mini-batch size (training parameter)
+# step Double 1e-5 Initial step size (training parameter)
+# decay Double 0.95 Decays step size after each epoch (training parameter)
+# mu Double 0.9 Momentum parameter (training parameter)
+# W1_rand Matrix[Double] Empty Weights might be initialized via input matrices
+# W2_rand Matrix[Double] Empty
+# W3_rand Matrix[Double] Empty
+# W4_rand Matrix[Double] Empty
+# ----------------------------------------------------------------------------------------------------------------------
#
-# W1_rand Int Rand Weights might be initialized via input matrices
-# W2_rand Int Rand
-# W3_rand Int Rand
-# W4_rand Int Rand
-# ---------------------------------------------------------------------------------------------
# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# W1_out Matrix[Double] --- Matrix storing weights between input layer and 1st hidden layer
-# b1_out Matrix[Double] --- Matrix storing bias between input layer and 1st hidden layer
-# W2_out Matrix[Double] --- Matrix storing weights between 1st hidden layer and 2nd hidden layer
-# b2_out Matrix[Double] --- Matrix storing bias between 1st hidden layer and 2nd hidden layer
-# W3_out Matrix[Double] --- Matrix storing weights between 2nd hidden layer and 3rd hidden layer
-# b3_out Matrix[Double] --- Matrix storing bias between 2nd hidden layer and 3rd hidden layer
-# W4_out Matrix[Double] --- Matrix storing weights between 3rd hidden layer and output layer
-# b4_out Matrix[Double] --- Matrix storing bias between 3rd hidden layer and output layer
-# HIDDEN Matrix[Double] --- Matrix storing the hidden (2nd) layer representation if needed
-# ---------------------------------------------------------------------------------------------
-#
-
-
-m_autoencoder_2layer = function(Matrix[Double] X, Integer num_hidden1, Integer num_hidden2, Integer max_epochs,
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# W1_out Matrix[Double] Matrix storing weights between input layer and 1st hidden layer
+# b1_out Matrix[Double] Matrix storing bias between input layer and 1st hidden layer
+# W2_out Matrix[Double] Matrix storing weights between 1st hidden layer and 2nd hidden layer
+# b2_out Matrix[Double] Matrix storing bias between 1st hidden layer and 2nd hidden layer
+# W3_out Matrix[Double] Matrix storing weights between 2nd hidden layer and 3rd hidden layer
+# b3_out Matrix[Double] Matrix storing bias between 2nd hidden layer and 3rd hidden layer
+# W4_out Matrix[Double] Matrix storing weights between 3rd hidden layer and output layer
+# b4_out Matrix[Double] Matrix storing bias between 3rd hidden layer and output layer
+# HIDDEN Matrix[Double] Matrix storing the hidden (2nd) layer representation if needed
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_autoencoder_2layer = function(Matrix[Double] X, Integer num_hidden1, Integer num_hidden2, Integer max_epochs,
Boolean full_obj = FALSE, Integer batch_size = 256, Double step = 1e-5, Double decay = 0.95, Double mu = 0.9,
Matrix[Double] W1_rand = matrix(0, rows=0, cols=0), Matrix[Double] W2_rand = matrix(0, rows=0, cols=0),
Matrix[Double] W3_rand = matrix(0, rows=0, cols=0), Matrix[Double] W4_rand = matrix(0, rows=0, cols=0),
diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index c230368..74c1aeb 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -19,6 +19,38 @@
#
#-------------------------------------------------------------
+# In the bandit function, the objective is to find an arm that optimises a known functional of the unknown arm-reward distributions.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_train Matrix[Double] ---
+# Y_train Matrix[Double] ---
+# X_test Matrix[Double] ---
+# Y_test Matrix[Double] ---
+# metaList List[Unknown] ---
+# evaluationFunc String ---
+# evalFunHp Matrix[Double] ---
+# lp Frame[Unknown] ---
+# primitives Frame[Unknown] ---
+# param Frame[Unknown] ---
+# k Integer 3
+# R Integer 50
+# baseLineScore Double
+# cv Boolean
+# cvk Integer 2
+# verbose Boolean TRUE
+# output String ""
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# perf Boolean
+# ----------------------------------------------------------------------------------------------------------------------
+
m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3,
Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Boolean verbose = TRUE, String output="")
diff --git a/scripts/builtin/bivar.dml b/scripts/builtin/bivar.dml
index e05f483..27629da 100644
--- a/scripts/builtin/bivar.dml
+++ b/scripts/builtin/bivar.dml
@@ -18,24 +18,32 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs.
# Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n}
# compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n).
#
# INPUT PARAMETERS:
-# -------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# -------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- Input matrix
# S1 Matrix[Integer] --- First attribute set {A_11, A_12, ... A_1m}
# S2 Matrix[Integer] --- Second attribute set {A_21, A_22, ... A_2n}
# T1 Matrix[Integer] --- Kind for attributes in S1
-# (kind=1 for scale, kind=2 for nominal, kind=3 for ordinal)
+# (kind=1 for scale, kind=2 for nominal, kind=3 for ordinal)
# verbose Boolean --- Print bivar stats
-# -------------------------------------------------------------------------------------------------
-# OUTPUT: Four matrices with bivar stats
-#-------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# basestats_scale_scale Matrix[Double] basestats_scale_scale as output with bivar stats
+# basestats_nominal_scale Matrix[Double] basestats_nominal_scale as output with bivar stats
+# basestats_nominal_nominal Matrix[Double] basestats_nominal_nominal as output with bivar stats
+# basestats_ordinal_ordinal Matrix[Double] basestats_ordinal_ordinal as output with bivar stats
+# ----------------------------------------------------------------------------------------------------------------------
m_bivar = function(Matrix[Double] X, Matrix[Double] S1, Matrix[Double] S2, Matrix[Double] T1, Matrix[Double] T2, Boolean verbose)
return (Matrix[Double] basestats_scale_scale, Matrix[Double] basestats_nominal_scale, Matrix[Double] basestats_nominal_nominal, Matrix[Double] basestats_ordinal_ordinal)
diff --git a/scripts/builtin/components.dml b/scripts/builtin/components.dml
index 5f37c07..20dafbc 100644
--- a/scripts/builtin/components.dml
+++ b/scripts/builtin/components.dml
@@ -23,6 +23,29 @@
# vector indicating the assignment of vertices to components,
# where each component is identified by the maximum vertex ID
# (i.e., row/column position of the input graph)
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# G Matrix[Double] --- Input graph; row/column positions correspond to the vertex IDs
+# maxi Integer 0 Maximum number of iterations
+# verbose Boolean TRUE flag specifying if logging information should be printed
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# C Matrix[Double] Vector assigning each vertex to its component, identified by the maximum vertex ID
+# ----------------------------------------------------------------------------------------------------------------------
m_components = function(Matrix[Double] G, Integer maxi = 0, Boolean verbose = TRUE)
return (Matrix[Double] C)
diff --git a/scripts/builtin/confusionMatrix.dml b/scripts/builtin/confusionMatrix.dml
index 9ef9bb9..9d291f5 100644
--- a/scripts/builtin/confusionMatrix.dml
+++ b/scripts/builtin/confusionMatrix.dml
@@ -19,31 +19,39 @@
#
#-------------------------------------------------------------
+# Accepts a vector for prediction and a one-hot-encoded matrix
+# Then it computes the max value of each vector and compares them
+# After which, it calculates and returns the sum of classifications
+# and the average of each true class.
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# P Double --- vector of Predictions
-# Y Double --- vector of Golden standard One Hot Encoded; the one hot encoded vector of actual labels
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# P Matrix[Double] --- vector of Predictions
+# Y Matrix[Double] --- vector of Golden standard One Hot Encoded; the one hot
+# encoded vector of actual labels
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# ConfusionSum Double --- The Confusion Matrix Sums of classifications
-# ConfusionAvg Double --- The Confusion Matrix averages of each true class
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# confusionSum Matrix[Double] The Confusion Matrix Sums of classifications
+# confusionAvg Matrix[Double] The Confusion Matrix averages of each true class
+# ----------------------------------------------------------------------------------------------------------------------
# Output is like:
# True Labels
# 1 2
# 1 TP | FP
# Predictions ----+----
# 2 FN | TN
-#
+#
# TP = True Positives
# FP = False Positives
# FN = False Negatives
# TN = True Negatives
+# ----------------------------------------------------------------------------------------------------------------------
m_confusionMatrix = function(Matrix[Double] P, Matrix[Double] Y)
return(Matrix[Double] confusionSum, Matrix[Double] confusionAvg)
diff --git a/scripts/builtin/cor.dml b/scripts/builtin/cor.dml
index ea7cf53..d6a5282 100644
--- a/scripts/builtin/cor.dml
+++ b/scripts/builtin/cor.dml
@@ -17,9 +17,25 @@
# specific language governing permissions and limitations
# under the License.
#
+#-------------------------------------------------------------
+
+# This function computes the correlation matrix
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- A Matrix Input to compute the correlation on
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Correlation matrix of the input matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_cor = function(Matrix[Double] X) return (Matrix[Double] Y) {
- # compute correlation matrix in vectorized form
Xc = X - colMeans(X);
Y = ((t(Xc) %*% Xc)/(nrow(X)-1)) / (t(colSds(X)) %*% colSds(X));
}
diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 26f051f..cfc415b 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -19,37 +19,37 @@
#
#-------------------------------------------------------------
-# ----------------------------------------------------------------------------
+# Corrects corrupted frames of strings
+# This algorithm operates on the assumption that most strings are correct
+# and simply swaps strings that do not occur often with similar strings that
+# occur more often
+#
# References:
# Fred J. Damerau. 1964.
# A technique for computer detection and correction of spelling errors.
# Commun. ACM 7, 3 (March 1964), 171–176.
# DOI:https://doi.org/10.1145/363958.363994
-# ----------------------------------------------------------------------------
-
-# Builtin function that corrects corrupted frames of strings
-# This algorithm operates on the assumption that most strings are correct
-# and simply swaps strings that do not occur often with similar strings that
-# occur more often
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# strings String --- The nx1 input frame of corrupted strings
-# frequency_threshold Double 0.05 Strings that occur above this frequency level will not be corrected
-# distance_threshold integer 2 Max distance at which strings are considered similar
-# decapitalize Boolean TRUE Decapitalize all strings before correction
-# correct Boolean TRUE Correct strings or only report potential errors
-# is_verbose Boolean FALSE Print debug information
+# ------------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ------------------------------------------------------------------------------------------------------------------------
+# strings Frame[String] --- The nx1 input frame of corrupted strings
+# nullMask Matrix[Double] --- ---
+# frequency_threshold Double 0.05 Strings that occur above this frequency level will not be corrected
+# distance_threshold Integer 2 Max distance at which strings are considered similar
+# decapitalize Boolean TRUE Decapitalize all strings before correction
+# correct Boolean TRUE Correct strings or only report potential errors
+# is_verbose Boolean FALSE Print debug information
#
+# ------------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# Y Frame - Corrected nx1 output frame
-# ----------------------------------------------------------------------------
+# OUTPUT:
+# ------------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ------------------------------------------------------------------------------------------------------------------------
+# Y Frame[String] Corrected nx1 output frame
+# ------------------------------------------------------------------------------------------------------------------------
# TODO: future: add parameter for list of words that are sure to be correct
diff --git a/scripts/builtin/cox.dml b/scripts/builtin/cox.dml
index cee4d79..f6c9363 100644
--- a/scripts/builtin/cox.dml
+++ b/scripts/builtin/cox.dml
@@ -19,15 +19,14 @@
#
#-------------------------------------------------------------
-#
-# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
+# This script fits a Cox proportional hazards regression model.
# The Breslow method is used for handling ties and the regression parameters
# are computed using trust region newton method with conjugate gradient
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix --- Location to read the input matrix X containing the survival data
# containing the following information
# 1: timestamps
@@ -50,36 +49,39 @@
# tol Double 0.000001 Tolerance ("epsilon")
# moi Int 100 Max. number of outer (Newton) iterations
# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
-# M[,1]: betas
-# M[,2]: exp(betas)
-# M[,3]: standard error of betas
-# M[,4]: Z
-# M[,5]: P-value
-# M[,6]: lower 100*(1-alpha)% confidence interval of betas
-# M[,7]: upper 100*(1-alpha)% confidence interval of betas
#
-# Two matrices containing a summary of some statistics of the fitted model:
-# 1 - File S with the following format
-# - row 1: no. of observations
-# - row 2: no. of events
-# - row 3: log-likelihood
-# - row 4: AIC
-# - row 5: Rsquare (Cox & Snell)
-# - row 6: max possible Rsquare
-# 2 - File T with the following format
-# - row 1: Likelihood ratio test statistic, degree of freedom, P-value
-# - row 2: Wald test statistic, degree of freedom, P-value
-# - row 3: Score (log-rank) test statistic, degree of freedom, P-value
-#
-# Additionally, the following matrices are stored (needed for prediction)
-# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
-# 2- Matrix XO which is matrix X with sorted timestamps
-# 3- Variance-covariance matrix of the betas COV
-# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
+# M[,1]: betas
+# M[,2]: exp(betas)
+# M[,3]: standard error of betas
+# M[,4]: Z
+# M[,5]: P-value
+# M[,6]: lower 100*(1-alpha)% confidence interval of betas
+# M[,7]: upper 100*(1-alpha)% confidence interval of betas
+# S,T Matrix[Double] Two matrices containing a summary of some statistics of the fitted model:
+# 1 - File S with the following format
+# - row 1: no. of observations
+# - row 2: no. of events
+# - row 3: log-likelihood
+# - row 4: AIC
+# - row 5: Rsquare (Cox & Snell)
+# - row 6: max possible Rsquare
+# 2 - File T with the following format
+# - row 1: Likelihood ratio test statistic, degree of freedom, P-value
+# - row 2: Wald test statistic, degree of freedom, P-value
+# - row 3: Score (log-rank) test statistic, degree of freedom, P-value
+# RT,XO,COV,MF Matrix[Double] Additionally, the following matrices are stored (needed for prediction)
+# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
+# 2- Matrix XO which is matrix X with sorted timestamps
+# 3- Variance-covariance matrix of the betas COV
+# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
+# ----------------------------------------------------------------------------------------------------------------------
m_cox = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] F, Matrix[Double] R,
Double alpha = 0.05, Double tol = 0.000001, Int moi = 100, Int mii = 0)
diff --git a/scripts/builtin/cspline.dml b/scripts/builtin/cspline.dml
index 2a697da..f3d376e 100644
--- a/scripts/builtin/cspline.dml
+++ b/scripts/builtin/cspline.dml
@@ -18,13 +18,16 @@
# under the License.
#
#-------------------------------------------------------------
+
+# Solves Cubic Spline Interpolation
#
-# THIS SCRIPT SOLVES CUBIC SPLINE INTERPOLATION
+# Algorithm: implements https://en.wikipedia.org/wiki/Spline_interpolation#Algorithm_to_find_the_interpolating_cubic_spline
+# It uses a natural spline with q1''(x0) == qn''(xn) == 0.0
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- 1-column matrix of x values knots. It is assumed that x values are
# monotonically increasing and there is no duplicates points in X
# Y Matrix[Double] --- 1-column matrix of corresponding y values knots
@@ -33,15 +36,15 @@
# tol Double -1.0 Tolerance (epsilon); conjugate graduent procedure terminates early if
# L2 norm of the beta-residual is less than tolerance * its initial norm
# maxi Integer -1 Maximum number of conjugate gradient iterations, 0 = no maximum
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# pred_Y Matrix[Double] --- Predicted value
-# K Matrix[Double] --- Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
#
-
-#Algorithms: It implement the https://en.wikipedia.org/wiki/Spline_interpolation#Algorithm_to_find_the_interpolating_cubic_spline
-#it usages natural spline with q1''(x0) == qn''(xn) == 0.0
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# pred_Y Matrix[Double] Predicted value
+# K Matrix[Double] Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
m_cspline = function(Matrix[Double] X, Matrix[Double] Y, Double inp_x,
String mode = "DS", Double tol = -1.0, Integer maxi = -1)
diff --git a/scripts/builtin/csplineCG.dml b/scripts/builtin/csplineCG.dml
index f88afe1..422a8ea 100644
--- a/scripts/builtin/csplineCG.dml
+++ b/scripts/builtin/csplineCG.dml
@@ -18,13 +18,13 @@
# under the License.
#
#-------------------------------------------------------------
-#
-# THIS SCRIPT SOLVES CUBIC SPLINE INTERPOLATION USING THE CONJUGATE GRADIENT ALGORITHM
+
+# Builtin that solves cubic spline interpolation using the conjugate gradient algorithm
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- 1-column matrix of x values knots. It is assumed that x values are
# monotonically increasing and there is no duplicates points in X
# Y Matrix[Double] --- 1-column matrix of corresponding y values knots
@@ -32,19 +32,17 @@
# tol Double 0.000001 Tolerance (epsilon); conjugate graduent procedure terminates early if
# L2 norm of the beta-residual is less than tolerance * its initial norm
# maxi Integer 0 Maximum number of conjugate gradient iterations, 0 = no maximum
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# pred_Y Matrix[Double] --- Predicted value
-# K Matrix[Double] --- Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
#
-
-#Algorithms: It implement the https://en.wikipedia.org/wiki/Spline_interpolation#Algorithm_to_find_the_interpolating_cubic_spline
-#it usages natural spline with q1''(x0) == qn''(xn) == 0.0
-
-
-m_csplineCG =function (Matrix[Double] X, Matrix[Double] Y, Double inp_x,
- Double tol = 0.000001, Integer maxi = 0
- )
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# pred_Y Matrix[Double] Predicted value
+# K Matrix[Double] Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_csplineCG = function (Matrix[Double] X, Matrix[Double] Y, Double inp_x, Double tol = 0.000001, Integer maxi = 0)
return (Matrix[Double] pred_Y, Matrix[Double] K)
{
K = calcKnotsDerivKsCG(X, Y, maxi, tol)
diff --git a/scripts/builtin/csplineDS.dml b/scripts/builtin/csplineDS.dml
index 9a6d13c..87ff58e 100644
--- a/scripts/builtin/csplineDS.dml
+++ b/scripts/builtin/csplineDS.dml
@@ -18,27 +18,27 @@
# under the License.
#
#-------------------------------------------------------------
-#
-# THIS SCRIPT SOLVES CUBIC SPLINE INTERPOLATION USING THE DIRECT SOLVER
+
+# Builtin that solves cubic spline interpolation using a direct solver.
#
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
-# X Matrix[Double] --- 1-column matrix of x values knots. It is assumed that x values are
-# monotonically increasing and there is no duplicates points in X
-# Y Matrix[Double] --- 1-column matrix of corresponding y values knots
-# inp_x Double --- the given input x, for which the cspline will find predicted y.
+# X Matrix[Double] --- 1-column matrix of x values knots. It is assumed that x values are
+# monotonically increasing and there is no duplicates points in X
+# Y Matrix[Double] --- 1-column matrix of corresponding y values knots
+# inp_x Double --- the given input x, for which the cspline will find predicted y.
#
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# pred_y Matrix[Double] --- Predicted value
-# K Matrix[Double] --- Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
#
-
-#Algorithms: It implement the https://en.wikipedia.org/wiki/Spline_interpolation#Algorithm_to_find_the_interpolating_cubic_spline
-#it usages natural spline with q1''(x0) == qn''(xn) == 0.0
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# pred_Y Matrix[Double] Predicted value
+# K Matrix[Double] Matrix of k parameters
+# ----------------------------------------------------------------------------------------------------------------------
m_csplineDS = function (Matrix[Double] X, Matrix[Double] Y, Double inp_x)
return (Matrix[Double] pred_Y, Matrix[Double] K)
diff --git a/scripts/builtin/cvlm.dml b/scripts/builtin/cvlm.dml
index 6604456..acf2cae 100644
--- a/scripts/builtin/cvlm.dml
+++ b/scripts/builtin/cvlm.dml
@@ -19,7 +19,32 @@
#
#-------------------------------------------------------------
-m_cvlm = function(Matrix[Double] X, Matrix[Double] y, Integer k, Integer icpt = 0, Double reg = 1e-7) return (Matrix[Double] y_predict, Matrix[Double] allbeta)
+# The cvlm-function is used for cross-validation of the provided data model. This function follows a non-exhaustive cross
+# validation method. It uses lm and lmPredict functions to solve the linear regression and to predict the class of a
+# feature vector with no intercept, shifting, and rescaling.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Recorded Data set into matrix
+# y Matrix[Double] --- 1-column matrix of response values.
+# k Integer --- Number of subsets needed; it should always be more than 1 and less than nrow(X)
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
+# reg Double 1e-7 Regularization constant (lambda) for L2-regularization. set to nonzero for
+# highly dependent/sparse/numerous features
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# y_predict Matrix[Double] Response values
+# allbeta Matrix[Double] Validated data set
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_cvlm = function(Matrix[Double] X, Matrix[Double] y, Integer k, Integer icpt = 0, Double reg = 1e-7)
+ return (Matrix[Double] y_predict, Matrix[Double] allbeta)
{
M = nrow(X);
lim = floor(as.integer(M/k));
diff --git a/scripts/builtin/dbscan.dml b/scripts/builtin/dbscan.dml
index ea8c6fd..4eb6f83 100644
--- a/scripts/builtin/dbscan.dml
+++ b/scripts/builtin/dbscan.dml
@@ -18,17 +18,25 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# Implements the DBSCAN clustering algorithm using Euclidian distance matrix
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ----------------------------------------------------------------------------
# X Matrix[Double] --- The input Matrix to do DBSCAN on.
# eps Double 0.5 Maximum distance between two points for one to be considered reachable for the other.
-# minPts Int 5 Number of points in a neighborhood for a point to be considered as a core point (includes the point itself).
+# minPts Int 5 Number of points in a neighborhood for a point to be considered as a core point
+# (includes the point itself).
+# ----------------------------------------------------------------------------------------------------------------------
#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# clusterMembers Matrix[Double] clustering Matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_dbscan = function (Matrix[Double] X, Double eps = 0.5, Integer minPts = 5)
return (Matrix[Double] clusterMembers)
@@ -69,4 +77,4 @@ m_dbscan = function (Matrix[Double] X, Double eps = 0.5, Integer minPts = 5)
# noise to 0
clusterMembers = clusterMembers * (rowSums(adjacency) > 0);
}
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/decisionTree.dml b/scripts/builtin/decisionTree.dml
index ac9f82b..e2f1e5a 100644
--- a/scripts/builtin/decisionTree.dml
+++ b/scripts/builtin/decisionTree.dml
@@ -19,37 +19,45 @@
#
#-------------------------------------------------------------
-#
-# THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES
+# Builtin script implementing classification trees with scale and categorical features
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- Feature matrix X; note that X needs to be both recoded and dummy coded
# Y Matrix[Double] --- Label matrix Y; note that Y needs to be both recoded and dummy coded
# R Matrix[Double] " " Matrix R which for each feature in X contains the following information
-# - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
-# a scalar feature vector, other positive Integers indicate the number of categories
-# If R is not provided by default all variables are assumed to be scale
+# - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
+# a scalar feature vector, other positive Integers indicate the number of categories
+# If R is not provided by default all variables are assumed to be scale
# bins Integer 20 Number of equiheight bins per scale feature to choose thresholds
# depth Integer 25 Maximum depth of the learned tree
# verbose Boolean FALSE boolean specifying if the algorithm should print information while executing
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
-# M[1,j]: id of node j (in a complete binary tree)
-# M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-# M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical)
-# that node j looks at if j is an internal node, otherwise 0
-# M[4,j]: Type of the feature that node j looks at if j is an internal node: holds the same information as R input vector
-# M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
-# stored in rows 6,7,... if j is categorical
-# If j is a leaf node: number of misclassified samples reaching at node j
-# M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale,
-# otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
-# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] Matrix M where each column corresponds to a node in the learned tree and each row
+# contains the following information:
+# M[1,j]: id of node j (in a complete binary tree)
+# M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
+# M[3,j]: Feature index of the feature (scale feature id if the feature is scale or
+# categorical feature id if the feature is categorical)
+# that node j looks at if j is an internal node, otherwise 0
+# M[4,j]: Type of the feature that node j looks at if j is an internal node: holds
+# the same information as R input vector
+# M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale,
+# otherwise the size of the subset of values
+# stored in rows 6,7,... if j is categorical
+# If j is a leaf node: number of misclassified samples reaching at node j
+# M[6:,j]: If j is an internal node: Threshold the example's feature value is compared
+# to is stored at M[6,j] if the feature chosen for j is scale,
+# otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
+# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+# ----------------------------------------------------------------------------------------------------------------------
m_decisionTree = function(
Matrix[Double] X,
diff --git a/scripts/builtin/deepWalk.dml b/scripts/builtin/deepWalk.dml
index 338ddaa..447ee6e 100644
--- a/scripts/builtin/deepWalk.dml
+++ b/scripts/builtin/deepWalk.dml
@@ -20,23 +20,26 @@
#-------------------------------------------------------------
# This script performs DeepWalk on a given graph (https://arxiv.org/pdf/1403.6652.pdf)
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Graph Matrix --- adjacency matrix of a graph (n x n)
-# w Integer --- window size
-# d Integer --- embedding size
-# gamma Integer --- walks per vertex
-# t Integer --- walk length
-# alpha Double 0.025 learning rate
-# beta Double 0.9 factor for decreasing learning rate
-# ---------------------------------------------------------------------------------------------
-# OUTPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Phi Matrix --- matrix of vertex/word representation (n x d)
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Graph Matrix[Double] --- adjacency matrix of a graph (n x n)
+# w Integer --- window size
+# d Integer --- embedding size
+# gamma Integer --- walks per vertex
+# t Integer --- walk length
+# alpha Double 0.025 learning rate
+# beta Double 0.9 factor for decreasing learning rate
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Phi Matrix[Double] matrix of vertex/word representation (n x d)
+# ----------------------------------------------------------------------------------------------------------------------
source("scripts/staging/entity-resolution/primitives/postprocessing.dml") as post;
diff --git a/scripts/builtin/denialConstraints.dml b/scripts/builtin/denialConstraints.dml
index da6d3f6..3d221ab 100644
--- a/scripts/builtin/denialConstraints.dml
+++ b/scripts/builtin/denialConstraints.dml
@@ -18,75 +18,73 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# This function considers some constraints indicating statements that can NOT happen in the data (denial constraints).
-#
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# dataFrame Frame --- frame which columns represent the variables of the data and the rows correspond to different tuples or instances.
-# Recommended to have a column indexing the instances from 1 to N (N=number of instances).
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# dataFrame Frame --- frame which columns represent the variables of the data and the rows correspond
+# to different tuples or instances.
+# Recommended to have a column indexing the instances from 1 to N (N=number of instances).
# constraintsFrame Frame --- frame with fixed columns and each row representing one constraint.
# 1. idx: (double) index of the constraint, from 1 to M (number of constraints)
-# 2. constraint.type: (string) The constraints can be of 3 different kinds:
-# - variableCompare: for each instance, it will compare the values of two variables (with a relation <, > or =).
-# - valueCompare: for each instance, it will compare a fixed value and a variable value (with a relation <, > or =).
-# - instanceCompare: for every couple of instances, it will compare the relation between two variables,
-# ie if the value of the variable 1 in instance 1 is lower/higher than the value of variable 1 in instance 2,
-# then the value of of variable 2 in instance 2 can't be lower/higher than the value of variable 2 in instance 2.
-# 3. group.by: (boolean) if TRUE only one group of data (defined by a variable option) will be considered for the constraint.
-# 4. group.variable: (string, only if group.by TRUE) name of the variable (column in dataFrame) that will divide our data in groups.
-# 5. group.option: (only if group.by TRUE) option of the group.variable that defines the group to consider.
-# 6. variable1: (string) first variable to compare (name of column in dataFrame).
-# 7. relation: (string) can be < , > or = in the case of variableCompare and valueCompare, and < >, < < , > < or > >
+# 2. constraint.type: (string) The constraints can be of 3 different kinds:
+# - variableCompare: for each instance, it will compare the values of two variables (with a relation <, > or =).
+# - valueCompare: for each instance, it will compare a fixed value and a variable value (with a relation <, > or =).
+# - instanceCompare: for every couple of instances, it will compare the relation between two variables,
+# ie if the value of the variable 1 in instance 1 is lower/higher than the value of variable 1 in instance 2,
+# then the value of variable 2 in instance 1 can't be lower/higher than the value of variable 2 in instance 2.
+# 3. group.by: (boolean) if TRUE only one group of data (defined by a variable option) will be considered for the constraint.
+# 4. group.variable: (string, only if group.by TRUE) name of the variable (column in dataFrame) that will divide our data in groups.
+# 5. group.option: (only if group.by TRUE) option of the group.variable that defines the group to consider.
+# 6. variable1: (string) first variable to compare (name of column in dataFrame).
+# 7. relation: (string) can be < , > or = in the case of variableCompare and valueCompare, and < >, < < , > < or > >
# in the case of instanceCompare
-# 8. variable2: (string) second variable to compare (name of column in dataFrame) or fixed value for the case of valueCompare.
-#
-
-# -----------------------
-# EXAMPLE:
-# dataFrame:
-#
-# rank discipline yrs.since.phd yrs.service sex salary
-# 1 Prof B 19 18 Male 139750
-# 2 Prof B 20 16 Male 173200
-# 3 AsstProf B 3 3 Male 79750.56
-# 4 Prof B 45 39 Male 115000
-# 5 Prof B 40 40 Male 141500
-# 6 AssocProf B 6 6 Male 97000
-# 7 Prof B 30 23 Male 175000
-# 8 Prof B 45 45 Male 147765
-# 9 Prof B 21 20 Male 119250
-# 10 Prof B 18 18 Female 129000
-# 11 AssocProf B 12 8 Male 119800
-# 12 AsstProf B 7 2 Male 79800
-# 13 AsstProf B 1 1 Male 77700
+# 8. variable2: (string) second variable to compare (name of column in dataFrame) or fixed value for the case of valueCompare.
+#
+# ----------------------------------------------------------------------------------------------------------------------
+# EXAMPLE:
+# dataFrame:
+#
+# rank discipline yrs.since.phd yrs.service sex salary
+# 1 Prof B 19 18 Male 139750
+# 2 Prof B 20 16 Male 173200
+# 3 AsstProf B 3 3 Male 79750.56
+# 4 Prof B 45 39 Male 115000
+# 5 Prof B 40 40 Male 141500
+# 6 AssocProf B 6 6 Male 97000
+# 7 Prof B 30 23 Male 175000
+# 8 Prof B 45 45 Male 147765
+# 9 Prof B 21 20 Male 119250
+# 10 Prof B 18 18 Female 129000
+# 11 AssocProf B 12 8 Male 119800
+# 12 AsstProf B 7 2 Male 79800
+# 13 AsstProf B 1 1 Male 77700
#
# constraintsFrame:
-#
-# idx constraint.type group.by group.variable group.option variable1 relation variable2
-# 1 variableCompare FALSE yrs.since.phd < yrs.service
-# 2 instanceCompare TRUE rank Prof yrs.service >< salary
-# 3 valueCompare FALSE salary = 78182
-# 4 variableCompare TRUE discipline B yrs.service > yrs.since.phd
+#
+# idx constraint.type group.by group.variable group.option variable1 relation variable2
+# 1 variableCompare FALSE yrs.since.phd < yrs.service
+# 2 instanceCompare TRUE rank Prof yrs.service >< salary
+# 3 valueCompare FALSE salary = 78182
+# 4 variableCompare TRUE discipline B yrs.service > yrs.since.phd
#
#
# Example: explanation of constraint 2 --> it can't happen that one professor of rank Prof has more years of service than other, but lower salary.
#
-#----------------------------------
-# OUTPUT PARAMETERS:
-#----------------------------------
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------
-# WrongInstances Matrix Double Matrix of 2 columns.
-# - First column shows the indexes of dataFrame that are wrong.
-# - Second column shows the index of the denial constraint that is fulfilled
-# If there are no wrong instances to show (0 constrains fulfilled) --> WrongInstances=matrix(0,1,2)
+# ----------------------------------------------------------------------------------------------------------------------
#
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# WrongInstances Matrix[double] Matrix of 2 columns.
+# - First column shows the indexes of dataFrame that are wrong.
+# - Second column shows the index of the denial constraint that is fulfilled
+# If there are no wrong instances to show (0 constraints fulfilled) --> WrongInstances=matrix(0,1,2)
+# ----------------------------------------------------------------------------------------------------------------------
s_denialConstraints = function(Frame[Unknown] dataFrame, Frame[Unknown] constraintsFrame)
return(Matrix[double] WrongInstances)
@@ -186,7 +184,7 @@ return(Matrix[double] WrongInstances)
WrongInstances[flag,2] = iConstraint-1
}
}
- }
+ }
# CONSTRAINT TO COMPARE A VALUE AND A VARIABLE FOR EACH iNSTANCE
@@ -247,7 +245,7 @@ return(Matrix[double] WrongInstances)
flag3=flag3+1
DataMatrix[flag3,1] = as.matrix(dataFrame[iInstance,1]) # InstanceIdx
DataMatrix[flag3,2] = as.matrix(dataFrame[iInstance,colIdx1])
- DataMatrix[flag3,3] = as.matrix(dataFrame[iInstance,colIdx2])
+ DataMatrix[flag3,3] = as.matrix(dataFrame[iInstance,colIdx2])
}
}
DataMatrix=DataMatrix[1:flag3,]
diff --git a/scripts/builtin/discoverFD.dml b/scripts/builtin/discoverFD.dml
index dbc4fbf..a1a4044 100644
--- a/scripts/builtin/discoverFD.dml
+++ b/scripts/builtin/discoverFD.dml
@@ -22,21 +22,22 @@
# Implements builtin for finding functional dependencies
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double -- Input Matrix X, encoded Matrix if data is categorical
-# Mask Double -- A row vector for interested features i.e. Mask =[1, 0, 1]
- # will exclude the second column from processing
-# threshold Double -- threshold value in interval [0, 1] for robust FDs
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# FD Double --- matrix of functional dependencies
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] -- Input Matrix X, encoded Matrix if data is categorical
+# Mask Matrix[Double] -- A row vector for interested features i.e. Mask =[1, 0, 1]
+# will exclude the second column from processing
+# threshold Double -- threshold value in interval [0, 1] for robust FDs
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# FD Matrix[Double] matrix of functional dependencies
+# ----------------------------------------------------------------------------------------------------------------------
m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double threshold)
return(Matrix[Double] FD)
diff --git a/scripts/builtin/dist.dml b/scripts/builtin/dist.dml
index 54db438..136efbd 100644
--- a/scripts/builtin/dist.dml
+++ b/scripts/builtin/dist.dml
@@ -17,8 +17,23 @@
# specific language governing permissions and limitations
# under the License.
#
+#-------------------------------------------------------------
# Returns Euclidian distance matrix (distances between N n-dimensional points)
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- ---
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Euclidian distance matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_dist = function(Matrix[Double] X) return (Matrix[Double] Y) {
G = X %*% t(X);
diff --git a/scripts/builtin/dmv.dml b/scripts/builtin/dmv.dml
index af68f1f..77528b7 100644
--- a/scripts/builtin/dmv.dml
+++ b/scripts/builtin/dmv.dml
@@ -17,10 +17,29 @@
# specific language governing permissions and limitations
# under the License.
#
-#------------------------------------------------------------
+#-------------------------------------------------------------
-s_dmv = function(Frame[String] X, Double threshold=0.8, String replace="NA") return (Frame[String] Y) {
+# The dmv-function is used to find disguised missing values utilising syntactical pattern recognition.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Frame[String] --- Input Frame
+# threshold Double 0.8 Threshold value in interval [0, 1] for dominant pattern per column (e.g., 0.8 means
+# that 80% of the entries per column must adhere to this pattern to be dominant)
+# replace String "NA" The string disguised missing values are replaced with
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Frame[String] Frame X including detected disguised missing values
+# ----------------------------------------------------------------------------------------------------------------------
+s_dmv = function(Frame[String] X, Double threshold=0.8, String replace="NA") return (Frame[String] Y) {
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1] found " + threshold)
diff --git a/scripts/builtin/ema.dml b/scripts/builtin/ema.dml
index 87c0639..508c146 100644
--- a/scripts/builtin/ema.dml
+++ b/scripts/builtin/ema.dml
@@ -17,7 +17,30 @@
# specific language governing permissions and limitations
# under the License.
#
-#------------------------------------------------------------
+#-------------------------------------------------------------
+
+# This function imputes values with exponential moving average (single, double or triple).
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Frame[Double] --- Frame that contains timeseries data that needs to be imputed
+# search_iterations Integer -- Budget iterations for parameter optimisation,
+# used if parameters weren't set
+# mode String --- Type of EMA method. Either "single", "double" or "triple"
+# freq Double --- Seasonality when using triple EMA.
+# alpha Double --- alpha- value for EMA
+# beta Double --- beta- value for EMA
+# gamma Double --- gamma- value for EMA
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Frame[Double] Frame with EMA results
+# ----------------------------------------------------------------------------------------------------------------------
# TODO: convert to DML builtin using cumsumprod(data, alpha)
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 4c3bea9..3a0358a 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -19,6 +19,40 @@
#
#-------------------------------------------------------------
+# This function executes a pipeline.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# logical Frame[String] NULL ---
+# pipeline Frame[String] --- ---
+# X Matrix[Double] --- ---
+# Y Matrix[Double] --- ---
+# Xtest Matrix[Double] --- ---
+# Ytest Matrix[Double] --- ---
+# metaList List[Unknown] --- ---
+# hyperParameters Matrix[Double] --- ---
+# hpForPruning Matrix[Double] 0 ---
+# changesByOp Matrix[Double] 0 ---
+# flagsCount Integer --- ---
+# test Boolean FALSE ---
+# verbose Boolean --- ---
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] ---
+# Y Matrix[Double] ---
+# Xtest Matrix[Double] ---
+# Ytest Matrix[Double] ---
+# t2 Double ---
+# hpForPruning Matrix[Double] ---
+# changesByOp Matrix[Double] ---
+# ----------------------------------------------------------------------------------------------------------------------
+
s_executePipeline = function(Frame[String] logical = as.frame("NULL"), Frame[String] pipeline, Matrix[Double] X, Matrix[Double] Y,
Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
diff --git a/scripts/builtin/ffPredict.dml b/scripts/builtin/ffPredict.dml
index 2290bf8..98a21de 100644
--- a/scripts/builtin/ffPredict.dml
+++ b/scripts/builtin/ffPredict.dml
@@ -22,21 +22,26 @@
# This builtin function makes prediction given data and trained feedforward neural network model
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# Model List[unknown] --- Trained ff neural network model
-# X Matrix[Double] --- Data used for making predictions
-# batch_size Integer 128 Batch size
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# pred Matrix[Double] --- Predicted value
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Model List[unknown] --- Trained ff neural network model
+# X Matrix[Double] --- Data used for making predictions
+# batch_size Integer 128 Batch size
#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# pred Matrix[Double] Predicted value
+# ----------------------------------------------------------------------------------------------------------------------
source("nn/layers/feedForward.dml") as ff_pass
-s_ffPredict = function(List[unknown] model, Matrix[double] X, Integer batch_size = 128)
- return (Matrix[double] pred) {
+s_ffPredict = function(List[unknown] model, Matrix[Double] X, Integer batch_size = 128)
+ return (Matrix[Double] pred) {
rows = nrow(X)
out = as.matrix(model["W2"])
diff --git a/scripts/builtin/ffTrain.dml b/scripts/builtin/ffTrain.dml
index ad9d2cb..3515385 100644
--- a/scripts/builtin/ffTrain.dml
+++ b/scripts/builtin/ffTrain.dml
@@ -18,31 +18,36 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# This builtin function trains simple feed-forward neural network. The architecture of the
# networks is: affine1 -> relu -> dropout -> affine2 -> configurable output activation function.
+# Hidden layer has 128 neurons. Dropout rate is 0.35. Input and output sizes are inferred from X and Y.
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X Matrix[double] --- Training data
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Training data
# Y Matrix[Double] --- Labels/Target values
# batch_size Integer 64 Batch size
# epochs Integer 20 Number of epochs
# learning_rate Double 0.003 Learning rate
# out_activation String --- User specified ouptut activation function. Possible values:
-# "sigmoid", "relu", "lrelu", "tanh", "softmax", "logits" (no activation).
+# "sigmoid", "relu", "lrelu", "tanh", "softmax", "logits" (no activation).
# loss_fcn String --- User specified loss function. Possible values:
-# "l1", "l2", "log_loss", "logcosh_loss", "cel" (cross-entropy loss).
+# "l1", "l2", "log_loss", "logcosh_loss", "cel" (cross-entropy loss).
# shuffle Boolean FALSE Flag which indicates if dataset should be shuffled or not
# validation_split Double 0.0 Fraction of training set used as validation set
# seed Integer -1 Seed for model initialization
# verbose Boolean FALSE Flag which indicates if function should print to stdout
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# model List[unknown] --- Trained model which can be used in ffPredict
+# ----------------------------------------------------------------------------------------------------------------------
#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# model List[unknown] Trained model which can be used in ffPredict
+# ----------------------------------------------------------------------------------------------------------------------
source("nn/layers/affine.dml") as affine
source("nn/layers/dropout.dml") as dropout
diff --git a/scripts/builtin/fixInvalidLengths.dml b/scripts/builtin/fixInvalidLengths.dml
index d674c93..7894239 100644
--- a/scripts/builtin/fixInvalidLengths.dml
+++ b/scripts/builtin/fixInvalidLengths.dml
@@ -19,10 +19,29 @@
#
#-------------------------------------------------------------
+# Fix invalid lengths
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# F1 Frame[Unknown] ---
+# mask Matrix[Double] ---
+# ql Double 0.05
+# qu Double 0.99
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# out Frame[Unknown] ---
+# M Matrix[Double] ---
+# ----------------------------------------------------------------------------------------------------------------------
+
s_fixInvalidLengths = function(Frame[Unknown] F1, Matrix[Double] mask, Double ql = 0.05, Double qu = 0.99)
return (Frame[Unknown] out, Matrix[Double] M)
{
-
length = map(F1, "x -> x.length()")
length = as.matrix(length)
length = replace(target = (length * mask), pattern = NaN, replacement = 0)
diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml
index 1a62f9f..aaf6cd3 100644
--- a/scripts/builtin/frameSort.dml
+++ b/scripts/builtin/frameSort.dml
@@ -21,22 +21,20 @@
# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
# Built-in for sorting frames
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# F String --- Data frame of string values
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# f_odered String --- sorted dataset by column 1 in decreasing order
-
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# F Frame[String] --- Data frame of string values
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# f_odered Frame[String] sorted dataset by column 1 in decreasing order
+# ----------------------------------------------------------------------------------------------------------------------
s_frameSort = function(Frame[String] F, Boolean orderDesc = TRUE )
return (Frame[String] f_odered)
@@ -49,4 +47,4 @@ return (Frame[String] f_odered)
[X, M] = transformencode(target=F, spec=jspecR);
ordered = order(target = X, by = 1, decreasing=orderDesc, index.return=FALSE)
f_odered = transformdecode(target=ordered, spec=jspecR, meta=M);
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/garch.dml b/scripts/builtin/garch.dml
index a37ca8b..b523b59 100644
--- a/scripts/builtin/garch.dml
+++ b/scripts/builtin/garch.dml
@@ -19,42 +19,42 @@
#
#-------------------------------------------------------------
-# Builtin function that implements GARCH(1,1)
+# This is a builtin function that implements GARCH(1,1), a statistical model used in analyzing time-series data where the variance
+# error is believed to be serially autocorrelated
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Double --- The input Matrix to apply Arima on.
-# kmax Integer --- Number of iterations
-# momentum Double --- Momentum for momentum-gradient descent (set to 0 to deactivate)
-# start_stepsize Double --- Initial gradient-descent stepsize
-# end_stepsize Double --- gradient-descent stepsize at end (linear descent)
-# start_vicinity Double --- proportion of randomness of restart-location for gradient descent at beginning
-# end_vicinity Double --- same at end (linear decay)
-# sim_seed Integer --- seed for simulation of process on fitted coefficients
-# verbose Boolean --- verbosity, comments during fitting
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input Matrix to apply GARCH on.
+# kmax Integer --- Number of iterations
+# momentum Double --- Momentum for momentum-gradient descent (set to 0 to deactivate)
+# start_stepsize Double --- Initial gradient-descent stepsize
+# end_stepsize Double --- gradient-descent stepsize at end (linear descent)
+# start_vicinity Double --- proportion of randomness of restart-location for gradient descent at beginning
+# end_vicinity Double --- same at end (linear decay)
+# sim_seed Integer --- seed for simulation of process on fitted coefficients
+# verbose Boolean --- verbosity, comments during fitting
+# ----------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# fitted_X Double --- simulated garch(1,1) process on fitted coefficients
-# fitted_var_hist Double --- variances of simulated fitted process
-# best_a0 Double --- constant term of fitted process
-# best_arch_coef Double --- 1-st arch-coefficient of fitted process
-# best_var_coef Double --- 1-st garch-coefficient of fitted process
-# ----------------------------------------------------------------------------
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# fitted_X Matrix[Double] simulated garch(1,1) process on fitted coefficients
+# fitted_var_hist Matrix[Double] variances of simulated fitted process
+# best_a0 Double constant term of fitted process
+# best_arch_coef Double 1-st arch-coefficient of fitted process
+# best_var_coef Double 1-st garch-coefficient of fitted process
+# ----------------------------------------------------------------------------------------------------------------------
#
# COMMENTS
# This has some drawbacks: slow convergence of optimization (sort of simulated annealing/gradient descent)
# TODO: use BFGS or BHHH if it is available (this are go to methods)
# TODO: (only then) extend to garch(p,q); otherwise the search space is way too big for the current method
-
-# ------- MAIN FUNCTION
-
-m_garch = function(Matrix[Double] X, Integer kmax, Double momentum, Double start_stepsize, Double end_stepsize, Double start_vicinity, Double end_vicinity, Integer sim_seed, Boolean verbose)
+m_garch = function(Matrix[Double] X, Integer kmax, Double momentum, Double start_stepsize, Double end_stepsize, Double start_vicinity,
+ Double end_vicinity, Integer sim_seed, Boolean verbose)
return (Matrix[Double] fitted_X, Matrix[Double] fitted_var_hist, Double best_a0, Double best_arch_coef, Double best_var_coef) {
[a0, arch_coef, var_coef] = sample_feasible_params() # initialize startpoint
diff --git a/scripts/builtin/gaussianClassifier.dml b/scripts/builtin/gaussianClassifier.dml
index c17db9f..bfdbfc5 100644
--- a/scripts/builtin/gaussianClassifier.dml
+++ b/scripts/builtin/gaussianClassifier.dml
@@ -18,7 +18,7 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# Computes the parameters needed for Gaussian Classification.
# Thus it computes the following per class: the prior probability,
# the inverse covariance matrix, the mean per feature and the determinant
@@ -26,27 +26,25 @@
# adds some small smoothing value along the variances, to prevent
# numerical errors / instabilities.
#
-#
# INPUT PARAMETERS:
-# -------------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# -------------------------------------------------------------------------------------------------
-# D Matrix[Double] --- Input matrix (training set)
-# C Matrix[Double] --- Target vector
-# varSmoothing Double 1e-9 Smoothing factor for variances
-# verbose Boolean TRUE Print accuracy of the training set
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# classPriors Matrix[Double] --- Vector storing the class prior probabilities
-# classMeans Matrix[Double] --- Matrix storing the means of the classes
-# classInvCovariances List[Unknown] --- List of inverse covariance matrices
-# determinants Matrix[Double] --- Vector storing the determinants of the classes
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# D Matrix[Double] --- Input matrix (training set)
+# C Matrix[Double] --- Target vector
+# varSmoothing Double 1e-9 Smoothing factor for variances
+# verbose Boolean TRUE Print accuracy of the training set
+# ----------------------------------------------------------------------------------------------------------------------
#
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# classPriors Matrix[Double] Vector storing the class prior probabilities
+# classMeans Matrix[Double] Matrix storing the means of the classes
+# classInvCovariances List[Unknown] List of inverse covariance matrices
+# determinants Matrix[Double] Vector storing the determinants of the classes
+# ----------------------------------------------------------------------------------------------------------------------
m_gaussianClassifier = function(Matrix[Double] D, Matrix[Double] C, Double varSmoothing=1e-9, Boolean verbose = TRUE)
return (Matrix[Double] classPriors, Matrix[Double] classMeans,
diff --git a/scripts/builtin/getAccuracy.dml b/scripts/builtin/getAccuracy.dml
index c3c2abb..5e38534 100644
--- a/scripts/builtin/getAccuracy.dml
+++ b/scripts/builtin/getAccuracy.dml
@@ -19,25 +19,23 @@
#
#-------------------------------------------------------------
-# compute the weighted and simple accuracy for given predictions
-
+# This builtin function computes the weighted and simple accuracy for given predictions
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# y Double --- Ground truth (Actual Labels)
-# yhat Double --- predictions (Predicted labels)
-# isWeighted Boolean FALSE flag for weighted or non-weighted accuracy calculation
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# accuracy Double --- accuracy of the predicted labels
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# y Matrix[Double] --- Ground truth (Actual Labels)
+# yhat Matrix[Double] --- Predictions (Predicted labels)
+# isWeighted Boolean FALSE Flag for weighted or non-weighted accuracy calculation
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# accuracy Double accuracy of the predicted labels
+# ----------------------------------------------------------------------------------------------------------------------
m_getAccuracy = function(Matrix[Double] y, Matrix[Double] yhat, Boolean isWeighted = FALSE)
return (Double accuracy)
@@ -61,4 +59,4 @@ return (Double accuracy)
classes = replace(target = classes, pattern = 0, replacement = 1)
accuracy = mean(colSums(pred)/t(classes)) * 100
}
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/glm.dml b/scripts/builtin/glm.dml
index e2b5255..cec5975 100644
--- a/scripts/builtin/glm.dml
+++ b/scripts/builtin/glm.dml
@@ -19,47 +19,49 @@
#
#-------------------------------------------------------------
-#
-# THIS BUILTIN SOLVES GLM REGRESSION USING NEWTON/FISHER SCORING WITH TRUST REGIONS
+# This script solves GLM regression using NEWTON/FISHER scoring with trust regions. The glm-function is a flexible
+# generalization of ordinary linear regression that allows for response variables that have error distribution models.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix X of feature vectors
-# Y Double --- matrix Y with either 1 or 2 columns:
-# if dfam = 2, Y is 1-column Bernoulli or 2-column Binomial (#pos, #neg)
-# dfam Int 1 Distribution family code: 1 = Power, 2 = Binomial
-# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1):
-# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
-# link Int 0 Link function code: 0 = canonical (depends on distribution),
-# 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit
-# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1):
-# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
-# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0
-# icpt Int 0 Intercept presence, X columns shifting and rescaling:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# reg Double 0.0 Regularization parameter (lambda) for L2 regularization
-# tol Double 0.000001 Tolerance (epsilon)
-# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data
-# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations
-# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X of feature vectors
+# Y Matrix[Double] --- matrix Y with either 1 or 2 columns:
+# if dfam = 2, Y is 1-column Bernoulli or 2-column Binomial (#pos, #neg)
+# dfam Int 1 Distribution family code: 1 = Power, 2 = Binomial
+# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1):
+# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
+# link Int 0 Link function code: 0 = canonical (depends on distribution),
+# 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit
+# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1):
+# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
+# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0
+# icpt Int 0 Intercept presence, X columns shifting and rescaling:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg Double 0.0 Regularization parameter (lambda) for L2 regularization
+# tol Double 0.000001 Tolerance (epsilon)
+# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data
+# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations
+# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
+#
+# ----------------------------------------------------------------------------------------------------------------------
#
-# OTUPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE MEANING
-# ---------------------------------------------------------------------------------------------
-# beta Double Matrix beta, whose size depends on icpt:
-# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2
-#----------------------------------------------------------------------------------------------
-# In addition, some GLM statistics are provided as console output by setting verbose = TRUE, one comma-separated name-value
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# beta Matrix[Double] Matrix beta, whose size depends on icpt:
+# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2
+#-----------------------------------------------------------------------------------------------------------------------
+#
+# In addition, some GLM statistics are provided as console output by setting verbose=TRUE, one comma-separated name-value
# pair per each line, as follows:
#
# NAME MEANING
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# TERMINATION_CODE A positive integer indicating success/failure as follows:
# 1 = Converged successfully; 2 = Maximum number of iterations reached;
# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
@@ -73,13 +75,13 @@
# DISPERSION_EST Dispersion estimated from the dataset
# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0
# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
#
# The Log file, when requested, contains the following per-iteration variables in CSV format,
# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
#
# NAME MEANING
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration
# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise
# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. "beta") to new point
@@ -92,15 +94,14 @@
# LINEAR_TERM_MAX The maximum value of X %*% beta, used to check for overflows
# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored
# TRUST_DELTA Updated trust region size, the "delta"
-# -------------------------------------------------------------------------------------------
-#
+# ----------------------------------------------------------------------------------------------------------------------
#
# SOME OF THE SUPPORTED GLM DISTRIBUTION FAMILIES
# AND LINK FUNCTIONS:
-# -----------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# INPUT PARAMETERS: MEANING: Cano-
# dfam vpow link lpow Distribution.link nical?
-# -----------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# 1 0.0 1 -1.0 Gaussian.inverse
# 1 0.0 1 0.0 Gaussian.log
# 1 0.0 1 1.0 Gaussian.id Yes
@@ -115,14 +116,14 @@
# 1 3.0 1 0.0 InvGaussian.log
# 1 3.0 1 1.0 InvGaussian.id
# 1 * 1 * AnyVariance.AnyLink
-# -----------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# 2 * 1 0.0 Binomial.log
# 2 * 1 0.5 Binomial.sqrt
# 2 * 2 * Binomial.logit Yes
# 2 * 3 * Binomial.probit
# 2 * 4 * Binomial.cloglog
# 2 * 5 * Binomial.cauchit
-# -----------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
m_glm = function(Matrix[Double] X, Matrix[Double] Y, Integer dfam=1,
Double vpow=0.0, Integer link=0, Double lpow=1.0, Double yneg=0.0,
diff --git a/scripts/builtin/glmPredict.dml b/scripts/builtin/glmPredict.dml
index 46b193d..484fdf4 100644
--- a/scripts/builtin/glmPredict.dml
+++ b/scripts/builtin/glmPredict.dml
@@ -20,34 +20,36 @@
#-------------------------------------------------------------
# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A GLM-TYPE REGRESSION TO A NEW (TEST) DATASET
-
+#
# INPUTS PARAMETERS:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# X Matrix --- Matrix X of records (feature vectors)
-# B Matrix --- GLM regression parameters (the betas), with dimensions
-# ncol(X) x k: do not add intercept
-# ncol(X)+1 x k: add intercept as given by the last B-row
-# if k > 1, use only B[, 1] unless it is Multinomial Logit (dfam=3)
-# ytest Matrix " " Response matrix Y, with the following dimensions:
-# nrow(X) x 1 : for all distributions (dfam=1 or 2 or 3)
-# nrow(X) x 2 : for Binomial (dfam=2) given by (#pos, #neg) counts
-# nrow(X) x k+1: for Multinomial (dfam=3) given by category counts
-# dfam Int 1 GLM distribution family: 1 = Power, 2 = Binomial, 3 = Multinomial Logit
-# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1):
-# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
-# link Int 0 Link function code: 0 = canonical (depends on distribution), 1 = Power,
-# 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; ignored if Multinomial
-# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1):
-# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
-# disp Double 1.0 Dispersion value, when available
-# verbose Boolean TRUE Print statistics to stdout
+# X Matrix[Double] --- Matrix X of records (feature vectors)
+# B Matrix[Double] --- GLM regression parameters (the betas), with dimensions
+# ncol(X) x k: do not add intercept
+# ncol(X)+1 x k: add intercept as given by the last B-row
+# if k > 1, use only B[, 1] unless it is Multinomial Logit (dfam=3)
+# ytest Matrix[Double] " " Response matrix Y, with the following dimensions:
+# nrow(X) x 1 : for all distributions (dfam=1 or 2 or 3)
+# nrow(X) x 2 : for Binomial (dfam=2) given by (#pos, #neg) counts
+# nrow(X) x k+1: for Multinomial (dfam=3) given by category counts
+# dfam Int 1 GLM distribution family: 1 = Power, 2 = Binomial, 3 = Multinomial Logit
+# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1):
+# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
+# link Int 0 Link function code: 0 = canonical (depends on distribution), 1 = Power,
+# 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; ignored if Multinomial
+# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1):
+# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
+# disp Double 1.0 Dispersion value, when available
+# verbose Boolean TRUE Print statistics to stdout
# ---------------------------------------------------------------------------------------------
# OUTPUTS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
# ---------------------------------------------------------------------------------------------
-# M Matrix " " Matrix M of predicted means/probabilities:
+# M Matrix[Double] Matrix M of predicted means/probabilities:
# nrow(X) x 1 : for Power-type distributions (dfam=1)
# nrow(X) x 2 : for Binomial distribution (dfam=2), column 2 is "No"
# nrow(X) x k+1: for Multinomial Logit (dfam=3), col# k+1 is baseline
diff --git a/scripts/builtin/gmm.dml b/scripts/builtin/gmm.dml
index a4530c4..62e9931 100644
--- a/scripts/builtin/gmm.dml
+++ b/scripts/builtin/gmm.dml
@@ -19,38 +19,39 @@
#
#-------------------------------------------------------------
-# ------------------------------------------
-# Gaussian Mixture Model
-# ------------------------------------------
-
+# The gmm-function implements builtin Gaussian Mixture Model with four different types of covariance matrices
+# i.e., VVV, EEE, VVI, VII and two initialization methods namely "kmeans" and "random".
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X
-# n_components Integer 3 Number of n_components in the Gaussian mixture model
-# model String "VVV" "VVV": unequal variance (full),each component has its own general covariance matrix
-# "EEE": equal variance (tied), all components share the same general covariance matrix
-# "VVI": spherical, unequal volume (diag), each component has its own diagonal
-# covariance matrix
-# "VII": spherical, equal volume (spherical), each component has its own single variance
-# init_param String "kmeans" initialize weights with "kmeans" or "random"
-# iterations Integer 100 Number of iterations
-# reg_covar Double 1e-6 regularization parameter for covariance matrix
-# tol Double 0.000001 tolerance value for convergence
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# weight Double --- A matrix whose [i,k]th entry is the probability that observation i in the test data
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# n_components Integer 3 Number of components in the Gaussian mixture model
+# model String "VVV" "VVV": unequal variance (full),each component has its own general covariance matrix
+# "EEE": equal variance (tied), all components share the same general covariance matrix
+# "VVI": spherical, unequal volume (diag), each component has its own diagonal
+# covariance matrix
+# "VII": spherical, equal volume (spherical), each component has its own single variance
+# init_params String "kmeans" initialize weights with "kmeans" or "random"
+# iter Integer 100 Number of iterations
+# reg_covar Double 1e-6 regularization parameter for covariance matrix
+# tol Double 0.000001 tolerance value for convergence
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# labels Matrix[Double] Prediction matrix
+# predict_prob Matrix[Double] Probability of the predictions
+# df Integer Number of estimated parameters
+# bic Double Bayesian information criterion for best iteration
+# mu Matrix[Double] fitted clusters mean
+# weight Matrix[Double] A matrix whose [i,k]th entry is the probability that observation i in the test data
# belongs to the kth class
-# labels Double --- Prediction matrix
-# df Integer --- Number of estimated parameters
-# bic Double --- Bayesian information criterion for best iteration
-
+# ----------------------------------------------------------------------------------------------------------------------
m_gmm = function(Matrix[Double] X, Integer n_components = 3, String model = "VVV", String init_params = "kmeans",
Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Integer seed = -1, Boolean verbose = FALSE )
diff --git a/scripts/builtin/gmmPredict.dml b/scripts/builtin/gmmPredict.dml
index 21a897b..54bc4db 100644
--- a/scripts/builtin/gmmPredict.dml
+++ b/scripts/builtin/gmmPredict.dml
@@ -18,29 +18,29 @@
# under the License.
#
#-------------------------------------------------------------
-# ------------------------------------------
-# Gaussian Mixture Model Predict
-# ------------------------------------------
+# This function is a Prediction function for a Gaussian Mixture Model (gmm).
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X (instances to be clustered)
-# weight Double --- Weight of learned model
-# mu Double --- fitted clusters mean
-# precisions_cholesky Double --- fitted precision matrix for each mixture
-# model String --- fitted model
-# ---------------------------------------------------------------------------------------------
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X (instances to be clustered)
+# weight Matrix[Double] --- Weight of learned model
+# mu Matrix[Double] --- fitted clusters mean
+# precisions_cholesky Matrix[Double] --- fitted precision matrix for each mixture
+# model String --- fitted model
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# predict Double --- predicted cluster labels
-# posterior_prob Double --- probabilities of belongingness
-# ---------------------------------------------------------------------------------------------
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# predict Double predicted cluster labels
+# posterior_prob Double probabilities of belongingness
+# ----------------------------------------------------------------------------------------------------------------------
+#
# compute posterior probabilities for new instances given the variance and mean of fitted data
m_gmmPredict = function(Matrix[Double] X, Matrix[Double] weight,
diff --git a/scripts/builtin/gnmf.dml b/scripts/builtin/gnmf.dml
index 3a3acb9..18dad21 100644
--- a/scripts/builtin/gnmf.dml
+++ b/scripts/builtin/gnmf.dml
@@ -19,13 +19,34 @@
#
#-------------------------------------------------------------
-# Implements Gaussian Nonnegative Matrix Factorization (GNMF)
+# The gnmf-function does Gaussian Non-Negative Matrix Factorization. In this, a matrix X is factorized into two
+# matrices W and H, such that all three matrices have no negative elements. This non-negativity makes the resulting
+# matrices easier to inspect.
#
+# References:
# [Chao Liu, Hung-chih Yang, Jinliang Fan, Li-Wei He, Yi-Min Wang:
-# Distributed nonnegative matrix factorization for web-scale dyadic
+# Distributed nonnegative matrix factorization for web-scale dyadic
# data analysis on mapreduce. WWW 2010: 681-690]
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# rnk Integer --- Number of components into which matrix X is to be factored
+# eps Double 1e-8 Tolerance
+# maxi Integer 10 Maximum number of conjugate gradient iterations
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# W Matrix[Double] List of pattern matrices, one for each repetition
+# H Matrix[Double] List of amplitude matrices, one for each repetition
+# ----------------------------------------------------------------------------------------------------------------------
-m_gnmf = function(Matrix[Double] X, Integer rnk, Double eps = 1e-8, Integer maxi = 10)
+m_gnmf = function(Matrix[Double] X, Integer rnk, Double eps = 1e-8, Integer maxi = 10)
return (Matrix[Double] W, Matrix[Double] H)
{
#initialize W and H
diff --git a/scripts/builtin/gridSearch.dml b/scripts/builtin/gridSearch.dml
index 1c0568d..eab7756 100644
--- a/scripts/builtin/gridSearch.dml
+++ b/scripts/builtin/gridSearch.dml
@@ -19,30 +19,41 @@
#
#-------------------------------------------------------------
-
-#-------------------------------------------------------------------------------
-# X Input feature matrix
-# y Input label vector (or matrix)
-# train Name ft of the train function to call via ft(trainArgs)
-# predict Name fp of the loss function to call via fp((predictArgs,B))
-# numB Maximum number of parameters in model B (pass the maximum
-# because the size of B may vary with parameters like icpt
-# params List of varied hyper-parameter names
-# paramValues List of matrices providing the parameter values as
-# columnvectors for position-aligned hyper-parameters in 'params'
-# trainArgs named List of arguments to pass to the 'train' function, where
-# gridSearch replaces enumerated hyper-parameter by name, if
-# not provided or an empty list, the lm parameters are used
-# predictArgs List of arguments to pass to the 'predict' function, where
-# gridSearch appends the trained models at the end, if
-# not provided or an empty list, list(X, y) is used instead
-# cv flag enabling k-fold cross validation, otherwise training loss
-# cvk if cv=TRUE, specifies the the number of folds, otherwise ignored
-# verbose flag for verbose debug output
-#-------------------------------------------------------------------------------
-# B the trained model with minimal loss (by the 'predict' function)
-# opt one-row frame w/ optimal hyperparameters (by 'params' position)
-#-------------------------------------------------------------------------------
+# The gridSearch-function is used to find the optimal hyper-parameters of a model which results in the most
+# accurate predictions. This function takes train and eval functions by name.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# y Matrix[Double] --- Input label vector (or matrix)
+# train String --- Name ft of the train function to call via ft(trainArgs)
+# predict String --- Name fp of the loss function to call via fp((predictArgs,B))
+# numB Integer --- Maximum number of parameters in model B (pass the maximum because the
+# size of B may vary with parameters like icpt)
+# params List[String] --- List of varied hyper-parameter names
+# paramValues List[Unknown] --- List of matrices providing the parameter values as
+# columnvectors for position-aligned hyper-parameters in 'params'
+# trainArgs List[Unknown] --- named List of arguments to pass to the 'train' function, where
+# gridSearch replaces enumerated hyper-parameter by name, if
+# not provided or an empty list, the lm parameters are used
+# predictArgs List[Unknown] --- List of arguments to pass to the 'predict' function, where
+# gridSearch appends the trained models at the end, if
+# not provided or an empty list, list(X, y) is used instead
+# cv Boolean FALSE flag enabling k-fold cross validation, otherwise training loss
+# cvk Integer 5 if cv=TRUE, specifies the number of folds, otherwise ignored
+# verbose Boolean TRUE flag for verbose debug output
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# B Matrix[Double] the trained model with minimal loss (by the 'predict' function)
+# opt Frame[Unknown] one-row frame w/ optimal hyperparameters (by 'params' position)
+#-----------------------------------------------------------------------------------------------------------------------
m_gridSearch = function(Matrix[Double] X, Matrix[Double] y, String train, String predict,
Integer numB=ncol(X), List[String] params, List[Unknown] paramValues,
diff --git a/scripts/builtin/hospitalResidencyMatch.dml b/scripts/builtin/hospitalResidencyMatch.dml
index 94eb931..0d5c640 100644
--- a/scripts/builtin/hospitalResidencyMatch.dml
+++ b/scripts/builtin/hospitalResidencyMatch.dml
@@ -1,5 +1,6 @@
#-------------------------------------------------------------
-## Licensed to the Apache Software Foundation (ASF) under one
+#
+# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
@@ -15,38 +16,38 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-##-------------------------------------------------------------
-# THIS SCRIPT COMPUTES A SOLUTION FOR THE HOSPITAL RESIDENCY MATCH PROBLEM
+#
+#-------------------------------------------------------------
+
+# This script computes a solution for the hospital residency match problem.
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# R Matrix --- Residents matrix R.
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# R Matrix[Double] --- Residents matrix R.
# It must be an ORDERED matrix.
-#
-# H Matrix --- Hospitals matrix H.
+# H Matrix[Double] --- Hospitals matrix H.
# It must be an UNORDRED matrix.
-#
-# capacity Matrix --- capacity of Hospitals matrix C.
+# capacity Matrix[Double] --- capacity of Hospitals matrix C.
# It must be a [n*1] matrix with non zero values.
# i.e. the leftmost value in a row is the most preferred partner's index.
# i.e. the leftmost value in a row in P is the preference value for the acceptor
# with index 1 and vice-versa (higher is better).
-# OUTPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# residencyMatch Matrix --- Result Matrix
+# verbose Boolean False If the operation is verbose
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# residencyMatch Matrix[Double] Result Matrix
# If cell [i,j] is non-zero, it means that Resident i has matched with Hospital j.
# Further, if cell [i,j] is non-zero, it holds the preference value that led to the match.
-#
-#
-# hospitalMatch Matrix --- Result Matrix
+# hospitalMatch Matrix[Double] Result Matrix
# If cell [i,j] is non-zero, it means that Resident i has matched with Hospital j.
# Further, if cell [i,j] is non-zero, it holds the preference value that led to the match.
#
#
+# ----------------------------------------------------------------------------------------------------------------------
# Residents.mtx:
# 2.0,1.0,3.0
# 1.0,2.0,3.0
@@ -73,8 +74,6 @@
# 1.0,0.0,0.0
# 0.0,2.0,0.0
#
-
-
# hospitalMatch.mtx
# 0.0,1.0,0.0
# 0.0,0.0,2.0
@@ -83,7 +82,7 @@
# Resident 1 has matched with Hospital 3 (since [1,3] is non-zero) at a preference level of 2.0.
# Resident 2 has matched with Hospital 1 (since [2,1] is non-zero) at a preference level of 1.0.
# Resident 3 has matched with Hospital 2 (since [3,2] is non-zero) at a preference level of 2.0.
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
m_hospitalResidencyMatch = function(Matrix[Double] R, Matrix[Double] H, Matrix[Double] capacity, Boolean verbose = FALSE)
return (Matrix[Double] residencyMatch, Matrix[Double] hospitalMatch)
diff --git a/scripts/builtin/hyperband.dml b/scripts/builtin/hyperband.dml
index fd4754d..6830c7f 100644
--- a/scripts/builtin/hyperband.dml
+++ b/scripts/builtin/hyperband.dml
@@ -10,7 +10,7 @@
#
# http://www.apache.org/licenses/LICENSE-2.0
#
-# Unless required by applicable law or agreed to in writing,
+# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
@@ -19,7 +19,40 @@
#
#-------------------------------------------------------------
-m_hyperband = function(Matrix[Double] X_train, Matrix[Double] y_train,
+# The hyperband-function is used for hyper parameter optimization and is based on multi-armed bandits and early
+# elimination. Through multiple parallel brackets and consecutive trials it will return the hyper parameter combination
+# which performed best on a validation dataset. A set of hyper parameter combinations is drawn from uniform distributions
+# with given ranges; Those make up the candidates for hyperband. Notes:
+# hyperband is hard-coded for lmCG, and uses lmPredict for validation
+# hyperband is hard-coded to use the number of iterations as a resource
+# hyperband can only optimize continuous hyperparameters
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_train Matrix[Double] --- Input Matrix of training vectors
+# y_train Matrix[Double] --- Labels for training vectors
+# X_val Matrix[Double] --- Input Matrix of validation vectors
+# y_val Matrix[Double] --- Labels for validation vectors
+# params List[String] --- List of parameters to optimize
+# paramRanges Matrix[Double] --- The min and max values for the uniform distributions to draw from.
+# One row per hyper parameter, first column specifies min, second column max value.
+# R Scalar[int] 81 Controls number of candidates evaluated
+# eta Scalar[int] 3 Determines fraction of candidates to keep after each trial
+# verbose Boolean TRUE If TRUE print messages are activated
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# bestWeights Matrix[Double] 1-column matrix of weights of best performing candidate
+# bestHyperParams Frame[Unknown] hyper parameters of best performing candidate
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_hyperband = function(Matrix[Double] X_train, Matrix[Double] y_train,
Matrix[Double] X_val, Matrix[Double] y_val, List[String] params,
Matrix[Double] paramRanges, Scalar[int] R = 81, Scalar[int] eta = 3,
Boolean verbose = TRUE)
diff --git a/scripts/builtin/img_brightness.dml b/scripts/builtin/img_brightness.dml
index 839c19a..60a14c2 100644
--- a/scripts/builtin/img_brightness.dml
+++ b/scripts/builtin/img_brightness.dml
@@ -19,6 +19,25 @@
#
#-------------------------------------------------------------
+# The img_brightness-function is an image data augmentation function. It changes the brightness of the image.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input matrix/image
+# value Double --- The amount of brightness to be changed for the image
+# channel_max Integer --- Maximum value of the brightness of the image
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output matrix/image
+# ----------------------------------------------------------------------------------------------------------------------
+
m_img_brightness = function(Matrix[Double] img_in, Double value, Integer channel_max) return (Matrix[Double] img_out) {
# change the brightness of an image
img_out = max(0, min(img_in + value, channel_max))
diff --git a/scripts/builtin/img_crop.dml b/scripts/builtin/img_crop.dml
index 68291ca..e2ee34a 100644
--- a/scripts/builtin/img_crop.dml
+++ b/scripts/builtin/img_crop.dml
@@ -19,6 +19,26 @@
#
#-------------------------------------------------------------
+# The img_crop-function is an image data augmentation function. It cuts out a subregion of an image.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input matrix/image
+# w Integer --- The width of the subregion required
+# h Integer --- The height of the subregion required
+# x_offset Integer --- The horizontal coordinate in the image to begin the crop operation
+# y_offset Integer --- The vertical coordinate in the image to begin the crop operation
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Cropped matrix/image
+# ----------------------------------------------------------------------------------------------------------------------
+
m_img_crop = function(Matrix[Double] img_in, Integer w, Integer h, Integer x_offset, Integer y_offset) return (Matrix[Double] img_out) {
# crop - cut out a subregion of an image. Adapted from image_utils.dml
orig_w = ncol(img_in)
diff --git a/scripts/builtin/img_cutout.dml b/scripts/builtin/img_cutout.dml
index d9f5128..b9042fd 100644
--- a/scripts/builtin/img_cutout.dml
+++ b/scripts/builtin/img_cutout.dml
@@ -19,24 +19,26 @@
#
#-------------------------------------------------------------
-# Image Cutout
-# Replaces a rectangular section of an image with a constant value.
+# Image Cutout function replaces a rectangular section of an image with a constant value.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image as 2D matrix with top left corner at [1, 1]
-# x Int --- Column index of the top left corner of the rectangle (starting at 1)
-# y Int --- Row index of the top left corner of the rectangle (starting at 1)
-# width Int --- Width of the rectangle (must be positive)
-# height Int --- Height of the rectangle (must be positive)
-# fill_value Double --- The value to set for the rectangle
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image as 2D matrix with top left corner at [1, 1]
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image as 2D matrix with top left corner at [1, 1]
+# x Int --- Column index of the top left corner of the rectangle (starting at 1)
+# y Int --- Row index of the top left corner of the rectangle (starting at 1)
+# width Int --- Width of the rectangle (must be positive)
+# height Int --- Height of the rectangle (must be positive)
+# fill_value Double --- The value to set for the rectangle
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image as 2D matrix with top left corner at [1, 1]
+# ----------------------------------------------------------------------------------------------------------------------
m_img_cutout = function(Matrix[Double] img_in, Integer x, Integer y, Integer width, Integer height, Double fill_value) return (Matrix[Double] img_out) {
rows = nrow(img_in)
diff --git a/scripts/builtin/img_invert.dml b/scripts/builtin/img_invert.dml
index 2fcbe14..b243cfa 100644
--- a/scripts/builtin/img_invert.dml
+++ b/scripts/builtin/img_invert.dml
@@ -19,19 +19,22 @@
#
#-------------------------------------------------------------
-# Invert Image
+# This is an image data augmentation function. It inverts an image.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image
-# max_value Double --- The maximum value pixels can have
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image
+# max_value Double --- The maximum value pixels can have
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image
+# ----------------------------------------------------------------------------------------------------------------------
m_img_invert = function(Matrix[Double] img_in, Double max_value) return (Matrix[Double] img_out) {
img_out = max_value - img_in
diff --git a/scripts/builtin/img_mirror.dml b/scripts/builtin/img_mirror.dml
index 9aaedb5..0431110 100644
--- a/scripts/builtin/img_mirror.dml
+++ b/scripts/builtin/img_mirror.dml
@@ -19,6 +19,23 @@
#
#-------------------------------------------------------------
+# This function is an image data augmentation function. It flips an image on the X (horizontal) or Y (vertical) axis.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input matrix/image
+# horizontal_axis Boolean --- If TRUE, flip the image on the X (horizontal) axis, otherwise on the Y (vertical) axis
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Flipped matrix/image
+# ----------------------------------------------------------------------------------------------------------------------
+
m_img_mirror = function(Matrix[Double] img_in, Boolean horizontal_axis) return (Matrix[Double] img_out) {
# flip an image on the x (horizontal) or y (vertical) axis
if( horizontal_axis)
diff --git a/scripts/builtin/img_posterize.dml b/scripts/builtin/img_posterize.dml
index 360d0ad..8953191 100644
--- a/scripts/builtin/img_posterize.dml
+++ b/scripts/builtin/img_posterize.dml
@@ -19,22 +19,24 @@
#
#-------------------------------------------------------------
-# Image Posterize
-# Limit pixel values to 2^bits different values in the range [0, 255].
+# The Image Posterize function limits pixel values to 2^bits different values in the range [0, 255].
# Assumes the input image can attain values in the range [0, 255].
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image
-# bits Int --- The number of bits keep for the values.
-# 1 means black and white, 8 means every integer between 0 and 255.
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image
+# bits Int --- The number of bits to keep for the values.
+# 1 means black and white, 8 means every integer between 0 and 255.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image
+# ----------------------------------------------------------------------------------------------------------------------
m_img_posterize = function(Matrix[Double] img_in, Integer bits) return (Matrix[Double] img_out) {
img_out = (img_in %/% 2^(8 - bits)) * (2^(8 - bits))
diff --git a/scripts/builtin/img_rotate.dml b/scripts/builtin/img_rotate.dml
index 58a1220..a06737c 100644
--- a/scripts/builtin/img_rotate.dml
+++ b/scripts/builtin/img_rotate.dml
@@ -19,22 +19,24 @@
#
#-------------------------------------------------------------
-# Image Rotate
-# Rotates the input image counter-clockwise around the center.
+# The Image Rotate function rotates the input image counter-clockwise around the center.
# Uses nearest neighbor sampling.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image as 2D matrix with top left corner at [1, 1]
-# radians Double --- The value by which to rotate in radian.
-# fill_value Double --- The background color revealed by the rotation
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image as 2D matrix with top left corner at [1, 1]
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image as 2D matrix with top left corner at [1, 1]
+# radians Double --- The value by which to rotate in radians.
+# fill_value Double --- The background color revealed by the rotation
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image as 2D matrix with top left corner at [1, 1]
+# ----------------------------------------------------------------------------------------------------------------------
m_img_rotate = function(Matrix[Double] img_in, Double radians, Double fill_value) return (Matrix[Double] img_out) {
# Translation matrix for moving the origin to the center of the image
diff --git a/scripts/builtin/img_sample_pairing.dml b/scripts/builtin/img_sample_pairing.dml
index beaa92c..949dc10 100644
--- a/scripts/builtin/img_sample_pairing.dml
+++ b/scripts/builtin/img_sample_pairing.dml
@@ -19,22 +19,24 @@
#
#-------------------------------------------------------------
-# Image Sample Pairing
-# Blends two images together
+# The image sample pairing function blends two images together.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in1 Matrix --- First input image
-# img_in2 Matrix --- Second input image
-# weight Double --- The weight given to the second image.
-# 0 means only img_in1, 1 means only img_in2 will be visible
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in1 Matrix[Double] --- First input image
+# img_in2 Matrix[Double] --- Second input image
+# weight Double --- The weight given to the second image.
+# 0 means only img_in1, 1 means only img_in2 will be visible
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image
+# ----------------------------------------------------------------------------------------------------------------------
m_img_sample_pairing = function(Matrix[Double] img_in1, Matrix[Double] img_in2, Double weight) return (Matrix[Double] img_out) {
if (weight < 0 | 1 < weight) {
diff --git a/scripts/builtin/img_shear.dml b/scripts/builtin/img_shear.dml
index df59a3c..745a254 100644
--- a/scripts/builtin/img_shear.dml
+++ b/scripts/builtin/img_shear.dml
@@ -19,23 +19,25 @@
#
#-------------------------------------------------------------
-# Image Shear
-# Applies a shearing transformation to an image.
+# This function applies a shearing transformation to an image.
# Uses nearest neighbor sampling.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image as 2D matrix with top left corner at [1, 1]
-# shear_x Int --- Shearing factor for horizontal shearing
-# shear_y Int --- Shearing factor for vertical shearing
-# fill_value Double --- The background color revealed by the shearing
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image as 2D matrix with top left corner at [1, 1]
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image as 2D matrix with top left corner at [1, 1]
+# shear_x Double --- Shearing factor for horizontal shearing
+# shear_y Double --- Shearing factor for vertical shearing
+# fill_value Double --- The background color revealed by the shearing
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image as 2D matrix with top left corner at [1, 1]
+# ----------------------------------------------------------------------------------------------------------------------
m_img_shear = function(Matrix[Double] img_in, Double shear_x, Double shear_y, Double fill_value) return (Matrix[Double] img_out) {
img_out = img_transform(img_in, ncol(img_in), nrow(img_in), 1, shear_x, 0, shear_y, 1, 0, fill_value)
diff --git a/scripts/builtin/img_transform.dml b/scripts/builtin/img_transform.dml
index 823d7aa..13571ae 100644
--- a/scripts/builtin/img_transform.dml
+++ b/scripts/builtin/img_transform.dml
@@ -19,27 +19,30 @@
#
#-------------------------------------------------------------
-# Image Transform
-# Applies an affine transformation to an image.
+# The Image Transform function applies an affine transformation to an image.
# Optionally resizes the image (without scaling).
# Uses nearest neighbor sampling.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image as 2D matrix with top left corner at [1, 1]
-# out_w Int --- Width of the output image
-# out_h Int --- Height of the output image
-# abcdef Int --- The first two rows of the affine matrix in row-major order
-# fill_value Double --- The background of the image
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image as 2D matrix with top left corner at [1, 1]
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image as 2D matrix with top left corner at [1, 1]
+# out_w Integer --- Width of the output image
+# out_h Integer --- Height of the output image
+# a,b,c,d,e,f Double --- The first two rows of the affine matrix in row-major order
+# fill_value Double --- The background of the image
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image as 2D matrix with top left corner at [1, 1]
+# ----------------------------------------------------------------------------------------------------------------------
-m_img_transform = function(Matrix[Double] img_in, Integer out_w, Integer out_h, Double a, Double b, Double c, Double d, Double e, Double f, Double fill_value) return (Matrix[Double] img_out) {
+m_img_transform = function(Matrix[Double] img_in, Integer out_w, Integer out_h, Double a, Double b, Double c, Double d,
+ Double e, Double f, Double fill_value) return (Matrix[Double] img_out) {
divisor = a * e - b * d
if(divisor == 0) {
print("Inverse matrix does not exist! Returning input.")
diff --git a/scripts/builtin/img_translate.dml b/scripts/builtin/img_translate.dml
index 6194c91..68c3aca 100644
--- a/scripts/builtin/img_translate.dml
+++ b/scripts/builtin/img_translate.dml
@@ -19,28 +19,31 @@
#
#-------------------------------------------------------------
-# Image Translate
-# Translates the image.
+# The Image Translate function translates the image.
# Optionally resizes the image (without scaling).
# Uses nearest neighbor sampling.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# img_in Matrix --- Input image as 2D matrix with top left corner at [1, 1]
-# offset_x Double --- The distance to move the image in x direction
-# offset_y Double --- The distance to move the image in y direction
-# out_w Int --- Width of the output image
-# out_h Int --- Height of the output image
-# fill_value Double --- The background of the image
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# img_out Matrix --- Output image as 2D matrix with top left corner at [1, 1]
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_in Matrix[Double] --- Input image as 2D matrix with top left corner at [1, 1]
+# offset_x Double --- The distance to move the image in x direction
+# offset_y Double --- The distance to move the image in y direction
+# out_w Integer --- Width of the output image
+# out_h Integer --- Height of the output image
+# fill_value Double --- The background of the image
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# img_out Matrix[Double] Output image as 2D matrix with top left corner at [1, 1]
+# ----------------------------------------------------------------------------------------------------------------------
-m_img_translate = function(Matrix[Double] img_in, Double offset_x, Double offset_y, Integer out_w, Integer out_h, Double fill_value) return (Matrix[Double] img_out) {
+m_img_translate = function(Matrix[Double] img_in, Double offset_x, Double offset_y, Integer out_w, Integer out_h, Double fill_value)
+ return (Matrix[Double] img_out) {
w = ncol(img_in)
h = nrow(img_in)
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 9ee7e15..2587d5d 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml
@@ -19,25 +19,24 @@
#
#-------------------------------------------------------------
-# Implements builtin for imputing missing values from observed values (if exist)
-# using robust functional dependencies
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double -- Matrix X
-# source Integer -- source attribute to use for imputation and error correction
-# target Integer -- attribute to be fixed
-# threshold Double -- threshold value in interval [0, 1] for robust FDs
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix with possible imputations
+# Implements builtin for imputing missing values from observed values (if they exist) using robust functional dependencies
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# sourceAttribute Integer --- source attribute to use for imputation and error correction
+# targetAttribute Integer --- attribute to be fixed
+# threshold Double --- threshold value in interval [0, 1] for robust FDs
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] Matrix with possible imputations
+# ----------------------------------------------------------------------------------------------------------------------
m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer targetAttribute, Double threshold, Boolean verbose = FALSE)
return(Matrix[Double] X)
diff --git a/scripts/builtin/imputeByMean.dml b/scripts/builtin/imputeByMean.dml
index 7e90388..361ff8b 100644
--- a/scripts/builtin/imputeByMean.dml
+++ b/scripts/builtin/imputeByMean.dml
@@ -19,27 +19,23 @@
#
#-------------------------------------------------------------
-# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
-
# impute the data by mean value and if the feature is categorical then by mode value
-
+# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Data Matrix (Recoded Matrix for categorical features)
-# mask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- imputed dataset
-
-
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix (Recoded Matrix for categorical features)
+# mask Matrix[Double] --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] imputed dataset
+# ----------------------------------------------------------------------------------------------------------------------
m_imputeByMean = function(Matrix[Double] X, Matrix[Double] mask)
return(Matrix[Double] X)
diff --git a/scripts/builtin/imputeByMedian.dml b/scripts/builtin/imputeByMedian.dml
index ff06dac..1d531e5 100644
--- a/scripts/builtin/imputeByMedian.dml
+++ b/scripts/builtin/imputeByMedian.dml
@@ -19,31 +19,27 @@
#
#-------------------------------------------------------------
-# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
-
# impute the data by median value and if the feature is categorical then by mode value
-
+# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Data Matrix (Recoded Matrix for categorical features)
-# mask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- imputed dataset
-
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix (Recoded Matrix for categorical features)
+# mask Matrix[Double] --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] imputed dataset
+# ----------------------------------------------------------------------------------------------------------------------
m_imputeByMedian = function(Matrix[Double] X, Matrix[Double] mask)
return(Matrix[Double] X)
{
-
nX = removeEmpty(target=X, margin="cols", select=(mask==0))
cX = removeEmpty(target=X, margin="cols", select=mask)
Mask_n = is.na(nX);
@@ -62,4 +58,4 @@ return(Matrix[Double] X)
q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
select=t(mask)), ncol(cX), ncol(X))
X = (X_n %*% p) + (X_c %*% q)
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/imputeByMode.dml b/scripts/builtin/imputeByMode.dml
index 0d55de5..db37e2e 100644
--- a/scripts/builtin/imputeByMode.dml
+++ b/scripts/builtin/imputeByMode.dml
@@ -19,29 +19,26 @@
#
#-------------------------------------------------------------
+# This function imputes the data by mode value
# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
-
-# impute the data by mode value
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Data Matrix (Recoded Matrix for categorical features)
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- imputed dataset
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix (Recoded Matrix for categorical features)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] imputed dataset
+# ----------------------------------------------------------------------------------------------------------------------
m_imputeByMode = function(Matrix[Double] X)
return(Matrix[Double] X)
{
-
Mask = is.na(X)
X = replace(target=X, pattern=NaN, replacement=0)
colMode = matrix(0, 1, ncol(X))
diff --git a/scripts/builtin/intersect.dml b/scripts/builtin/intersect.dml
index 747e1bf..f4ab3a5 100644
--- a/scripts/builtin/intersect.dml
+++ b/scripts/builtin/intersect.dml
@@ -20,21 +20,21 @@
#-------------------------------------------------------------
# Implements set intersection for numeric data
-
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix X, set A
-# Y Double --- matrix Y, set B
-# ---------------------------------------------------------------------------------------------
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X, set A
+# Y Matrix[Double] --- matrix Y, set B
+# ----------------------------------------------------------------------------------------------------------------------
+#
# Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# R Double --- intersection matrix, set of intersecting items
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# R Matrix[Double] intersection matrix, set of intersecting items
+# ----------------------------------------------------------------------------------------------------------------------
m_intersect = function(Matrix[Double] X, Matrix[Double] Y)
return(Matrix[Double] R)
diff --git a/scripts/builtin/km.dml b/scripts/builtin/km.dml
index 02ca447..32194b3 100644
--- a/scripts/builtin/km.dml
+++ b/scripts/builtin/km.dml
@@ -19,67 +19,73 @@
#
#-------------------------------------------------------------
-#
# Builtin function that implements the analysis of survival data with KAPLAN-MEIER estimates
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input matrix X containing the survival data:
-# timestamps, whether event occurred (1) or data is censored (0), and a
-# number of factors (categorical features) for grouping and/or stratifying
-# TE Matrix --- Column indices of X which contain timestamps (first entry) and event
-# information (second entry)
-# GI Matrix --- Column indices of X corresponding to the factors to be used for grouping
-# SI Matrix --- Column indices of X corresponding to the factors to be used for stratifying
-# alpha Double 0.05 Parameter to compute 100*(1-alpha)% confidence intervals for the survivor
-# function and its median
-# err_type String "greenwood" Parameter to specify the error type according to "greenwood" (the default) or "peto"
-# conf_type String "log" Parameter to modify the confidence interval; "plain" keeps the lower and
-# upper bound of the confidence interval unmodified, "log" (the default)
-# corresponds to logistic transformation and "log-log" corresponds to the
-# complementary log-log transformation
-# test_type String "none" If survival data for multiple groups is available specifies which test to
-# perform for comparing survival data across multiple groups: "none" (the default)
-# "log-rank" or "wilcoxon" test
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
-# each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data
-# with the following schema
-# 1. col: timestamp
-# 2. col: no. at risk
-# 3. col: no. of events
-# 4. col: Kaplan-Meier estimate of survivor function surv
-# 5. col: standard error of surv
-# 6. col: lower 100*(1-alpha)% confidence interval for surv
-# 7. col: upper 100*(1-alpha)% confidence interval for surv
-# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number
-# of factors used for grouping ,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
-# M[,1:k]: unique combination of values in the k factors used for grouping
-# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
-# M[,k+l+1]: total number of records
-# M[,k+l+2]: total number of events
-# M[,k+l+3]: median of surv
-# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
-# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
-# If the number of groups and strata is equal to 1, M will have 4 columns with
-# M[,1]: total number of events
-# M[,2]: median of surv
-# M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
-# M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
-# 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
-# T_GROUPS_OE[,1] = no. of events
-# T_GROUPS_OE[,2] = observed value (O)
-# T_GROUPS_OE[,3] = expected value (E)
-# T_GROUPS_OE[,4] = (O-E)^2/E
-# T_GROUPS_OE[,5] = (O-E)^2/V
-# T[1,1] = no. of groups
-# T[1,2] = degree of freedom for Chi-squared distributed test statistic
-# T[1,3] = test statistic
-# T[1,4] = P-value
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input matrix X containing the survival data:
+# timestamps, whether event occurred (1) or data is censored (0), and a
+# number of factors (categorical features) for grouping and/or stratifying
+# TE Matrix[Double] --- Column indices of X which contain timestamps (first entry) and event
+# information (second entry)
+# GI Matrix[Double] --- Column indices of X corresponding to the factors to be used for grouping
+# SI Matrix[Double] --- Column indices of X corresponding to the factors to be used for stratifying
+# alpha Double 0.05 Parameter to compute 100*(1-alpha)% confidence intervals for the survivor
+# function and its median
+# err_type String "greenwood" Parameter to specify the error type according to "greenwood" (the default) or "peto"
+# conf_type String "log" Parameter to modify the confidence interval; "plain" keeps the lower and
+# upper bound of the confidence interval unmodified, "log" (the default)
+# corresponds to logistic transformation and "log-log" corresponds to the
+# complementary log-log transformation
+# test_type String "none" If survival data for multiple groups is available, specifies which test to
+# perform for comparing survival data across multiple groups: "none" (the default),
+# "log-rank" or "wilcoxon" test
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# O Matrix[Double] Matrix KM whose dimension depends on the number of groups (denoted by g) and
+# strata (denoted by s) in the data:
+# each collection of 7 consecutive columns in KM corresponds to a unique
+# combination of groups and strata in the data with the following schema
+# 1. col: timestamp
+# 2. col: no. at risk
+# 3. col: no. of events
+# 4. col: Kaplan-Meier estimate of survivor function surv
+# 5. col: standard error of surv
+# 6. col: lower 100*(1-alpha)% confidence interval for surv
+# 7. col: upper 100*(1-alpha)% confidence interval for surv
+# M Matrix[Double] Matrix M whose dimension depends on the number of groups (g) and strata (s) in
+# the data (k denotes the number of factors used for grouping, i.e., ncol(GI) and
+# l denotes the number of factors used for stratifying, i.e., ncol(SI))
+# M[,1:k]: unique combination of values in the k factors used for grouping
+# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
+# M[,k+l+1]: total number of records
+# M[,k+l+2]: total number of events
+# M[,k+l+3]: median of surv
+# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
+# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
+# If the number of groups and strata is equal to 1, M will have 4 columns with
+# M[,1]: total number of events
+# M[,2]: median of surv
+# M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
+# M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
+# T_GROUPS_OE Matrix[Double] If survival data from multiple groups is available and test_type is "log-rank" or
+# "wilcoxon", a 1 x 4 matrix T and a g x 5 matrix T_GROUPS_OE with
+# T_GROUPS_OE[,1] = no. of events
+# T_GROUPS_OE[,2] = observed value (O)
+# T_GROUPS_OE[,3] = expected value (E)
+# T_GROUPS_OE[,4] = (O-E)^2/E
+# T_GROUPS_OE[,5] = (O-E)^2/V
+# T[1,1] = no. of groups
+# T[1,2] = degree of freedom for Chi-squared distributed test statistic
+# T[1,3] = test statistic
+# T[1,4] = P-value
+# ----------------------------------------------------------------------------------------------------------------------
m_km = function(Matrix[Double] X, Matrix[Double] TE, Matrix[Double] GI, Matrix[Double] SI,
Double alpha = 0.05, String err_type = "greenwood", String conf_type = "log", String test_type = "none")
diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 33aaa03..45b74d1 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -22,27 +22,27 @@
# Builtin function that implements the k-Means clustering algorithm
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Matrix --- The input Matrix to do KMeans on.
-# k Int 10 Number of centroids
-# runs Int 10 Number of runs (with different initial centroids)
-# max_iter Int 1000 Maximum number of iterations per run
-# eps Double 0.000001 Tolerance (epsilon) for WCSS change ratio
-# is_verbose Boolean FALSE do not print per-iteration stats
-# avg_sample_size_per_centroid Int 50 Average number of records per centroid in data samples
-# seed Int -1 The seed used for initial sampling. If set to -1 random seeds are selected.
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- The input Matrix to do KMeans on.
+# k Int 10 Number of centroids
+# runs Int 10 Number of runs (with different initial centroids)
+# max_iter Int 1000 Maximum number of iterations per run
+# eps Double 0.000001 Tolerance (epsilon) for WCSS change ratio
+# is_verbose Boolean FALSE do not print per-iteration stats
+# avg_sample_size_per_centroid Int 50 Average number of records per centroid in data samples
+# seed Int -1 The seed used for initial sampling. If set to -1
+# random seeds are selected.
+# ----------------------------------------------------------------------------------------------------------------------
#
-#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# Y String "Y.mtx" The mapping of records to centroids
-# C String "C.mtx" The output matrix with the centroids
-# ----------------------------------------------------------------------------
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] The mapping of records to centroids
+# C Matrix[Double] The output matrix with the centroids
+# ----------------------------------------------------------------------------------------------------------------------
m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer max_iter = 1000,
Double eps = 0.000001, Boolean is_verbose = FALSE, Integer avg_sample_size_per_centroid = 50,
diff --git a/scripts/builtin/kmeansPredict.dml b/scripts/builtin/kmeansPredict.dml
index 63c9bc4..7e80e97 100644
--- a/scripts/builtin/kmeansPredict.dml
+++ b/scripts/builtin/kmeansPredict.dml
@@ -17,24 +17,24 @@
# specific language governing permissions and limitations
# under the License.
#
-#-----------------------------------------------------------------------------
+#-------------------------------------------------------------
# Builtin function that does predictions based on a set of centroids provided.
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Matrix --- The input Matrix to do KMeans on.
-# C Matrix --- The input Centroids to map X onto.
+# -----------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# -----------------------------------------------------------------------------
+# X Matrix[Double] --- The input Matrix to do KMeans on.
+# C Matrix[Double] --- The input Centroids to map X onto.
+# -----------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# Y String "Y.mtx" The mapping of records to centroids
-# ----------------------------------------------------------------------------
-
+# OUTPUT:
+# -----------------------------------------------------------------------------
+# NAME TYPE MEANING
+# -----------------------------------------------------------------------------
+# Y Matrix[Double] The mapping of records to centroids
+# -----------------------------------------------------------------------------
m_kmeansPredict = function(Matrix[Double] X, Matrix[Double] C)
return (Matrix[Double] Y)
diff --git a/scripts/builtin/knn.dml b/scripts/builtin/knn.dml
index 8e86ba3..8a24713 100644
--- a/scripts/builtin/knn.dml
+++ b/scripts/builtin/knn.dml
@@ -19,39 +19,41 @@
#
#-------------------------------------------------------------
-# THIS SCRIPT IMPLEMENTS KNN( K Nearest Neighbor ) ALGORITHM
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT OPTIONAL MEANING
-# ---------------------------------------------------------------------------------------------
-# Train Matrix --- N The input matrix as features
-# Test Matrix --- N The input matrix for nearest neighbor search
-# CL Matrix --- Y The input matrix as target
-# CL_T Integer 0 Y The target type of matrix CL whether
-# columns in CL are continuous ( =1 ) or
-# categorical ( =2 ) or
-# not specified ( =0 )
-# trans_continuous Boolean FALSE Y Option flag for continuous feature transformed to [-1,1]:
-# FALSE = do not transform continuous variable;
-# TRUE = transform continuous variable;
-# k_value int 5 Y k value for KNN, ignore if select_k enable
-# select_k Boolean FALSE Y Use k selection algorithm to estimate k
-# ( TRUE means yes )
-# k_min int 1 Y Min k value( available if select_k = 1 )
-# k_max int 100 Y Max k value( available if select_k = 1 )
-# select_feature Boolean FALSE Y Use feature selection algorithm to select feature
-# ( TRUE means yes )
-# feature_max int 10 Y Max feature selection
-# interval int 1000 Y Interval value for K selecting ( available if select_k = 1 )
-# feature_importance Boolean FALSE Y Use feature importance algorithm to estimate each feature
-# ( TRUE means yes )
-# predict_con_tg int 0 Y Continuous target predict function: mean(=0) or
-# median(=1)
-# START_SELECTED Matrix --- Y feature selection initinal value
-# ---------------------------------------------------------------------------------------------
-# OUTPUT: Matrix NNR, Matrix PR, Matrix FEATURE_IMPORTANCE_VALUE
-#
+# This script implements KNN (K Nearest Neighbor) algorithm.
+
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT OPTIONAL MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Train Matrix[Double] --- N The input matrix as features
+# Test Matrix[Double] --- N The input matrix for nearest neighbor search
+# CL Matrix[Double] --- Y The input matrix as target
+# CL_T Integer 0 Y The target type of matrix CL whether
+# columns in CL are continuous ( =1 ) or
+# categorical ( =2 ) or not specified ( =0 )
+# trans_continuous Boolean FALSE Y Option flag for continuous feature transformed to [-1,1]:
+# FALSE = do not transform continuous variable;
+# TRUE = transform continuous variable;
+# k_value int 5 Y k value for KNN, ignored if select_k is enabled
+# select_k Boolean FALSE Y Use k selection algorithm to estimate k (TRUE means yes)
+# k_min int 1 Y Min k value( available if select_k = 1 )
+# k_max int 100 Y Max k value( available if select_k = 1 )
+# select_feature Boolean FALSE Y Use feature selection algorithm to select feature (TRUE means yes)
+# feature_max int 10 Y Max feature selection
+# interval int 1000 Y Interval value for K selecting ( available if select_k = 1 )
+# feature_importance Boolean FALSE Y Use feature importance algorithm to estimate each feature
+# (TRUE means yes)
+# predict_con_tg int 0 Y Continuous target predict function: mean(=0) or median(=1)
+# START_SELECTED Matrix[Double] Empty Y feature selection initial value
+# ----------------------------------------------------------------------------------------------------------------------
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# NNR_matrix Matrix[Double] ---
+# CL_matrix Matrix[Double] ---
+# m_feature_importance Matrix[Double] Feature importance value
+# ----------------------------------------------------------------------------------------------------------------------
m_knn = function(
Matrix[Double] Train,
diff --git a/scripts/builtin/knnbf.dml b/scripts/builtin/knnbf.dml
index 1146680..4c9540a 100644
--- a/scripts/builtin/knnbf.dml
+++ b/scripts/builtin/knnbf.dml
@@ -19,13 +19,26 @@
#
#-------------------------------------------------------------
-m_knnbf = function(
- Matrix[Double] X,
- Matrix[Double] T,
- Integer k_value = 5
- ) return(
- Matrix[Double] NNR
- )
+# This script implements KNN (K Nearest Neighbor) algorithm.
+#
+# INPUT PARAMETERS:
+# -----------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# -----------------------------------------------------------
+# X Matrix[Double] --- ---
+# T Matrix[Double] --- ---
+# k_value Integer 5 ---
+# -----------------------------------------------------------
+#
+# OUTPUT:
+# -----------------------------------------------------------
+# NAME TYPE MEANING
+# -----------------------------------------------------------
+# NNR Matrix[Double] ---
+# -----------------------------------------------------------
+
+m_knnbf = function(Matrix[Double] X, Matrix[Double] T, Integer k_value = 5)
+ return(Matrix[Double] NNR)
{
num_records = nrow(X);
num_queries = nrow(T);
diff --git a/scripts/builtin/l2svm.dml b/scripts/builtin/l2svm.dml
index 8d98b90..9fa6a71 100644
--- a/scripts/builtin/l2svm.dml
+++ b/scripts/builtin/l2svm.dml
@@ -19,31 +19,31 @@
#
#-------------------------------------------------------------
-
# Builtin function Implements binary-class SVM with squared slack variables
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- matrix X of feature vectors
-# Y Matrix --- matrix Y of class labels have to be a single column
-# intercept Boolean False No Intercept ( If set to TRUE then a constant bias column is added to X)
-# epsilon Double 0.001 Procedure terminates early if the reduction in objective function value is less than epsilon (tolerance) times the initial objective function value.
-# lambda Double 1.0 Regularization parameter (lambda) for L2 regularization
-# maxIterations Int 100 Maximum number of conjugate gradient iterations
-# maxii Int 20 -
-# verbose Boolean FALSE Set to true if one wants print statements updating on loss.
-# columnId Int -1 The column Id used if one wants to add a ID to the print statement, Specificly usefull when L2SVM is used in MSVM.
-# ---------------------------------------------------------------------------------------------
-
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# model Double --- model matrix
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X of feature vectors
+# Y Matrix[Double] --- matrix Y of class labels; has to be a single column
+# intercept Boolean False No Intercept ( If set to TRUE then a constant bias column is added to X)
+# epsilon Double 0.001 Procedure terminates early if the reduction in objective function value is less
+# than epsilon (tolerance) times the initial objective function value.
+# lambda Double 1.0 Regularization parameter (lambda) for L2 regularization
+# maxIterations Int 100 Maximum number of conjugate gradient iterations
+# maxii Int 20 -
+# verbose Boolean FALSE Set to true if one wants print statements updating on loss.
+# columnId Int -1 The column Id used if one wants to add an ID to the print statement. Specifically
+# useful when L2SVM is used in MSVM.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# model Matrix[Double] model matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_l2svm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE,
Double epsilon = 0.001, Double lambda = 1, Integer maxIterations = 100,
diff --git a/scripts/builtin/l2svmPredict.dml b/scripts/builtin/l2svmPredict.dml
index 87136d5..960ffd9 100644
--- a/scripts/builtin/l2svmPredict.dml
+++ b/scripts/builtin/l2svmPredict.dml
@@ -19,25 +19,25 @@
#
#-------------------------------------------------------------
+# Builtin function that applies a trained binary-class SVM (with squared slack variables) to classify feature vectors.
-#
-#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix X of feature vectors to classify
-# W Double --- matrix of the trained variables
-# verbose Boolean FALSE Set to true if one wants print statements.
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X of feature vectors to classify
+# W Matrix[Double] --- matrix of the trained variables
+# verbose Boolean FALSE Set to true if one wants print statements.
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Y^ Double --- Classification Labels Raw, meaning not modified to clean
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# YRaw Matrix[Double] Classification Labels Raw, meaning not modified to clean
# Labels of 1's and -1's
-# Y Double --- Classification Labels Maxed to ones and zeros.
-
+# Y Matrix[Double] Classification Labels Maxed to ones and zeros.
+# ----------------------------------------------------------------------------------------------------------------------
m_l2svmPredict = function(Matrix[Double] X, Matrix[Double] W, Boolean verbose = FALSE)
return(Matrix[Double] YRaw, Matrix[Double] Y)
@@ -53,4 +53,4 @@ m_l2svmPredict = function(Matrix[Double] X, Matrix[Double] W, Boolean verbose =
YRaw = X %*% W
Y = rowIndexMax(YRaw)
}
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/lasso.dml b/scripts/builtin/lasso.dml
index 6fc6027..8bb6e19 100644
--- a/scripts/builtin/lasso.dml
+++ b/scripts/builtin/lasso.dml
@@ -23,19 +23,24 @@
# (SpaRSA .. Sparse Reconstruction by Separable Approximation)
#
# INPUTS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- input feature matrix
-# y Double --- matrix Y columns of the design matrix
-# tol Double 1e-15 target convergence tolerance
-# M Integer 5 history length
-# tau Double 1 regularization component
-# maxi Integer 100 maximum number of iterations until convergence
-# ---------------------------------------------------------------------------------------------
-# OUTPUTS
-# w Double --- model matrix
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- input feature matrix
+# y Matrix[Double] --- matrix Y columns of the design matrix
+# tol Double 1e-15 target convergence tolerance
+# M Integer 5 history length
+# tau Double 1 regularization component
+# maxi Integer 100 maximum number of iterations until convergence
+# verbose Boolean True if the builtin should be verbose
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# w Matrix[Double] model matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_lasso = function(Matrix[Double] X, Matrix[Double] y, Double tol = 1e-15,
Integer M = 5, Double tau = 1, Integer maxi = 100, Boolean verbose = TRUE)
diff --git a/scripts/builtin/lenetPredict.dml b/scripts/builtin/lenetPredict.dml
index 9b6af37..ca184b0 100644
--- a/scripts/builtin/lenetPredict.dml
+++ b/scripts/builtin/lenetPredict.dml
@@ -16,24 +16,30 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+#
#-------------------------------------------------------------
# This builtin function makes prediction given data and trained LeNet model
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# model List[unknown] --- Trained LeNet model
# X Matrix[Double] --- Input data matrix, of shape (N, C*Hin*Win)
# C Integer --- Number of input channels
# Hin Integer --- Input height
# Win Integer --- Input width
# batch_size Integer --- Batch size
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# pred Matrix[Double] --- Predicted values
+# ----------------------------------------------------------------------------------------------------------------------
#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# probs Matrix[Double] Predicted values
+# ----------------------------------------------------------------------------------------------------------------------
+
source("nn/layers/lenetForwardPass.dml") as lenet_fw
s_lenetPredict = function(list[unknown] model, Matrix[Double] X, Integer C,
diff --git a/scripts/builtin/lenetTrain.dml b/scripts/builtin/lenetTrain.dml
index a26467a..4f02c2d 100644
--- a/scripts/builtin/lenetTrain.dml
+++ b/scripts/builtin/lenetTrain.dml
@@ -19,18 +19,17 @@
#
#-------------------------------------------------------------
-#
# This builtin function trains LeNet CNN. The architecture of the
-# networks is:conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 ->
+# networks is:conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 ->
# affine3 -> relu3 -> affine4 -> softmax
-#
+
# INPUT PARAMETERS:
# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------------------
-# X Matrix[double] --- Input data matrix, of shape (N, C*Hin*Win)
+# X Matrix[Double] --- Input data matrix, of shape (N, C*Hin*Win)
# Y Matrix[Double] --- Target matrix, of shape (N, K)
-# X_val Matrix[double] --- Validation data matrix, of shape (N, C*Hin*Win)
+# X_val Matrix[Double] --- Validation data matrix, of shape (N, C*Hin*Win)
# Y_val Matrix[Double] --- Validation target matrix, of shape (N, K)
# C Integer --- Number of input channels (dimensionality of input depth)
# Hin Integer --- Input height
@@ -43,10 +42,14 @@
# lambda Double 5e-04 Regularization strength
# seed Integer -1 Seed for model initialization
# verbose Boolean FALSE Flag indicates if function should print to stdout
-# --------------------------------------------------------------------------------------------
-# OUTPUT:
-# model List[unknown] --- Trained model which can be used in lenetPredict
+# ----------------------------------------------------------------------------------------------------------------------
#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# model List[unknown] Trained model which can be used in lenetPredict
+# ----------------------------------------------------------------------------------------------------------------------
source("nn/layers/affine.dml") as affine
source("nn/layers/conv2d_builtin.dml") as conv2d
diff --git a/scripts/builtin/lm.dml b/scripts/builtin/lm.dml
index b990c34..2ffba76 100644
--- a/scripts/builtin/lm.dml
+++ b/scripts/builtin/lm.dml
@@ -19,26 +19,29 @@
#
#-------------------------------------------------------------
-#
+# The lm-function solves linear regression using either the direct solve method or the conjugate gradient
+# algorithm depending on the input size of the matrices (See lmDS-function and lmCG-function respectively).
+
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Matrix --- Matrix of feature vectors.
-# y Matrix --- 1-column matrix of response values.
-# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
-# reg Double 1e-7 Regularization constant (lambda) for L2-regularization. set to nonzero for highly dependant/sparse/numerous features
-# tol Double 1e-7 Tolerance (epsilon); conjugate gradient procedure terminates early if L2 norm of the beta-residual is less than tolerance * its initial norm
-# maxi Integer 0 Maximum number of conjugate gradient iterations. 0 = no maximum
-# verbose Boolean TRUE If TRUE print messages are activated
-#
-#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# B String "B.mtx" The model fit
-# ----------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# y Matrix[Double] --- 1-column matrix of response values.
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
+# reg Double 1e-7 Regularization constant (lambda) for L2-regularization. set to nonzero
+# for highly dependent/sparse/numerous features
+# tol Double 1e-7 Tolerance (epsilon); conjugate gradient procedure terminates early if L2
+# norm of the beta-residual is less than tolerance * its initial norm
+# maxi Integer 0 Maximum number of conjugate gradient iterations. 0 = no maximum
+# verbose Boolean TRUE If TRUE print messages are activated
+# ----------------------------------------------------------------------------------------------------------------------
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# B Matrix[Double] The model fit
+# ----------------------------------------------------------------------------------------------------------------------
m_lm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
return (Matrix[Double] B) {
diff --git a/scripts/builtin/lmCG.dml b/scripts/builtin/lmCG.dml
index faecc36..73fcc18 100644
--- a/scripts/builtin/lmCG.dml
+++ b/scripts/builtin/lmCG.dml
@@ -19,7 +19,31 @@
#
#-------------------------------------------------------------
-m_lmCG = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE) return (Matrix[Double] B) {
+# The lmCG function solves linear regression using the conjugate gradient algorithm
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# y Matrix[Double] --- 1-column matrix of response values.
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
+# reg Double 1e-7 Regularization constant (lambda) for L2-regularization. set to nonzero
+# for highly dependent/sparse/numerous features
+# tol Double 1e-7 Tolerance (epsilon); conjugate gradient procedure terminates early if L2
+# norm of the beta-residual is less than tolerance * its initial norm
+# maxi Integer 0 Maximum number of conjugate gradient iterations. 0 = no maximum
+# verbose Boolean TRUE If TRUE print messages are activated
+# ----------------------------------------------------------------------------------------------------------------------
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# B Matrix[Double] The model fit
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_lmCG = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Double tol = 1e-7,
+ Integer maxi = 0, Boolean verbose = TRUE) return (Matrix[Double] B) {
intercept_status = icpt;
regularization = reg;
tolerance = tol;
diff --git a/scripts/builtin/lmDS.dml b/scripts/builtin/lmDS.dml
index 431c415..b5d4b41 100644
--- a/scripts/builtin/lmDS.dml
+++ b/scripts/builtin/lmDS.dml
@@ -19,7 +19,31 @@
#
#-------------------------------------------------------------
-m_lmDS = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Boolean verbose = TRUE) return (Matrix[Double] B) {
+# The lmDS function solves linear regression using the direct solve method
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# y Matrix[Double] --- 1-column matrix of response values.
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
+# reg Double 1e-7 Regularization constant (lambda) for L2-regularization. set to nonzero
+# for highly dependent/sparse/numerous features
+# tol Double 1e-7 Tolerance (epsilon); conjugate gradient procedure terminates early if L2
+# norm of the beta-residual is less than tolerance * its initial norm
+# maxi Integer 0 Maximum number of conjugate gradient iterations. 0 = no maximum
+# verbose Boolean TRUE If TRUE print messages are activated
+# ----------------------------------------------------------------------------------------------------------------------
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# B Matrix[Double] The model fit
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_lmDS = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7,
+ Boolean verbose = TRUE) return (Matrix[Double] B) {
intercept_status = icpt;
regularization = reg;
diff --git a/scripts/builtin/lmPredict.dml b/scripts/builtin/lmPredict.dml
index 0570f50..20f8ca4 100644
--- a/scripts/builtin/lmPredict.dml
+++ b/scripts/builtin/lmPredict.dml
@@ -19,6 +19,27 @@
#
#-------------------------------------------------------------
+# The lmPredict-function predicts the class of a feature vector
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors
+# B Matrix[Double] --- 1-column matrix of weights.
+# ytest Matrix[Double] --- test labels, used only for verbose output. can be set to matrix(0,1,1)
+# if verbose output is not wanted
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X
+# verbose Boolean FALSE If TRUE print messages are activated
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# yhat Matrix[Double] 1-column matrix of classes
+# ----------------------------------------------------------------------------------------------------------------------
+
m_lmPredict = function(Matrix[Double] X, Matrix[Double] B,
Matrix[Double] ytest = matrix(0,1,1), Integer icpt = 0, Boolean verbose = FALSE)
return (Matrix[Double] yhat)
diff --git a/scripts/builtin/logSumExp.dml b/scripts/builtin/logSumExp.dml
index 3114f84..ad88684 100644
--- a/scripts/builtin/logSumExp.dml
+++ b/scripts/builtin/logSumExp.dml
@@ -19,27 +19,24 @@
#
#-------------------------------------------------------------
-# ------------------------------------------
-# Built-in LOGSUMEXP
-# ------------------------------------------
-
-
+# Built-in LOGSUMEXP
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix M
-# margin String none if the logsumexp of rows is required set margin = "row"
-# if the logsumexp of columns is required set margin = "col"
-# if set to "none" then a single scalar is returned computing logsumexp of matrix
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# output Double --- A 1*1 matrix, row vector or column vector depends on margin value
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] --- matrix M
+# margin String none if the logsumexp of rows is required set margin = "row"
+# if the logsumexp of columns is required set margin = "col"
+# if set to "none" then a single scalar is returned computing logsumexp of matrix
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# output Matrix[Double] a 1*1 matrix, row vector or column vector depending on margin value
+# ----------------------------------------------------------------------------------------------------------------------
m_logSumExp = function(Matrix[Double] M, String margin = "none")
return(Matrix[Double] output)
diff --git a/scripts/builtin/matrixProfile.dml b/scripts/builtin/matrixProfile.dml
index e6fcbff..715704f 100644
--- a/scripts/builtin/matrixProfile.dml
+++ b/scripts/builtin/matrixProfile.dml
@@ -19,36 +19,35 @@
#
#-------------------------------------------------------------
-# ----------------------------------------------------------------------------
+# Builtin function that computes the MatrixProfile of a time series efficiently
+# using the SCRIMP++ algorithm.
+#
# References:
# Yan Zhu et al.. 2018.
# Matrix Profile XI: SCRIMP++: Time Series Motif Discovery at Interactive Speeds.
# 2018 IEEE International Conference on Data Mining (ICDM), 2018, pp. 837-846.
# DOI: 10.1109/ICDM.2018.00099.
# https://www.cs.ucr.edu/~eamonn/SCRIMP_ICDM_camera_ready_updated.pdf
-# ----------------------------------------------------------------------------
-
-# Builtin function that computes the MatrixProfile of a time series efficiently
-# using the SCRIMP++ algorithm.
#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# ts Matrix --- Time series to profile
-# window_size Integer 4 Sliding window size
-# sample_percent Double 1.0 Degree of approximation
-# between zero and one (1
-# computes the exact solution)
-# is_verbose Boolean False Print debug information
-#
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# ts Matrix[Double] --- Time series to profile
+# window_size Integer 4 Sliding window size
+# sample_percent Double 1.0 Degree of approximation
+# between zero and one (1
+# computes the exact solution)
+# is_verbose Boolean False Print debug information
+# ----------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# profile Matrix --- The computed matrix profile
-# profile_index Matrix --- Indices of least distances
-
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# profile Matrix[Double] The computed matrix profile
+# profile_index Matrix[Double] Indices of least distances
+# ----------------------------------------------------------------------------------------------------------------------
m_matrixProfile = function(Matrix[Double] ts, Integer window_size=4, Double sample_percent=1.0, Boolean is_verbose=FALSE)
return(Matrix[Double] profile, Matrix[Double] profile_index)
diff --git a/scripts/builtin/mdedup.dml b/scripts/builtin/mdedup.dml
index 9ffb028..9edf4aa 100644
--- a/scripts/builtin/mdedup.dml
+++ b/scripts/builtin/mdedup.dml
@@ -17,29 +17,30 @@
# specific language governing permissions and limitations
# under the License.
#
-#------------------------------------------------------------------------------------------------------------------
+#-------------------------------------------------------------
# Implements builtin for deduplication using matching dependencies (e.g. Street 0.95, City 0.90 -> ZIP 1.0)
# and Jaccard distance.
#
# INPUT PARAMETERS:
-# -----------------------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# -----------------------------------------------------------------------------------------------------------------
-# X Frame -- Input Frame X
-# LHSfeatures Matrix[Integer] -- A matrix 1xd with numbers of columns for MDs
-# (e.g. Street 0.95, City 0.90 -> ZIP 1.0)
-# LHSthreshold Matrix[Double] -- A matrix 1xd with threshold values in interval [0, 1] for MDs
-# RHSfeatures Matrix[Integer] -- A matrix 1xd with numbers of columns for MDs
-# RHSthreshold Matrix[Double] -- A matrix 1xd with threshold values in interval [0, 1] for MDs
-# verbose Boolean -- To print the output
-# -----------------------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# X Frame[String] --- Input Frame X
+# LHSfeatures Matrix[Double] --- A matrix 1xd with numbers of columns for MDs
+# (e.g. Street 0.95, City 0.90 -> ZIP 1.0)
+# LHSthreshold Matrix[Double] --- A matrix 1xd with threshold values in interval [0, 1] for MDs
+# RHSfeatures Matrix[Double] --- A matrix 1xd with numbers of columns for MDs
+# RHSthreshold Matrix[Double] --- A matrix 1xd with threshold values in interval [0, 1] for MDs
+# verbose Boolean --- To print the output
+# ----------------------------------------------------------------------------------------------------------------------
#
-# Output(s)
-# -----------------------------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# -----------------------------------------------------------------------------------------------------------------
-# MD Matrix[Double] --- Matrix nx1 of duplicates
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# MD Matrix[Double] Matrix nx1 of duplicates
+# ----------------------------------------------------------------------------------------------------------------------
s_mdedup = function(Frame[String] X, Matrix[Double] LHSfeatures, Matrix[Double] LHSthreshold,
Matrix[Double] RHSfeatures, Matrix[Double] RHSthreshold, Boolean verbose)
diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index f5a373f..f1741ee 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -19,31 +19,32 @@
#
#-------------------------------------------------------------
-# Built-in function Implements Multiple Imputation using Chained Equations (MICE)
+# This Builtin function implements multiple imputation using Chained Equations (MICE)
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Data Matrix (Recoded Matrix for categorical features)
-# cMask Double --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
-# iter Integer 3 Number of iteration for multiple imputations
-# threshold Double 0.8 confidence value [0, 1] for robust imputation, values will only be imputed
-# if the predicted value has probability greater than threshold,
-# only applicable for categorical data
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# output Double --- imputed dataset
-
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix (Recoded Matrix for categorical features)
+# cMask Matrix[Double] --- A 0/1 row vector for identifying numeric (0) and categorical features (1)
+# iter Integer 3 Number of iterations for multiple imputations
+# threshold Double 0.8 confidence value [0, 1] for robust imputation, values will only be imputed
+# if the predicted value has probability greater than threshold,
+# only applicable for categorical data
+# verbose Boolean FALSE If TRUE print messages are activated
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# output Matrix[Double] imputed dataset
+# ----------------------------------------------------------------------------------------------------------------------
+#
# Assumption missing value are represented with empty string i.e ",," in CSV file
# variables with suffix n are storing continuos/numeric data and variables with
# suffix c are storing categorical data
+
m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3,
Double threshold = 0.8, Boolean verbose = FALSE)
return(Matrix[Double] output)
diff --git a/scripts/builtin/msvm.dml b/scripts/builtin/msvm.dml
index 88f26de..4775288 100644
--- a/scripts/builtin/msvm.dml
+++ b/scripts/builtin/msvm.dml
@@ -21,27 +21,28 @@
# Implements builtin multiclass SVM with squared slack variables,
# learns one-against-the-rest binary-class classifiers by making a function call to l2SVM
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix X of feature vectors
-# Y Double --- matrix Y of class labels
-# intercept Boolean False No Intercept ( If set to TRUE then a constant bias column is added to X)
-# num_classes integer 10 Number of classes
-# epsilon Double 0.001 Procedure terminates early if the reduction in objective function
-# value is less than epsilon (tolerance) times the initial objective function value.
-# lambda Double 1.0 Regularization parameter (lambda) for L2 regularization
-# maxIterations Int 100 Maximum number of conjugate gradient iterations
-# verbose Boolean False Set to true to print while training.
-# ---------------------------------------------------------------------------------------------
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# model Double --- model matrix
+#-----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#-----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X of feature vectors
+# Y Matrix[Double] --- matrix Y of class labels
+# intercept Boolean False No Intercept ( If set to TRUE then a constant bias column is added to X)
+# num_classes Integer 10 Number of classes
+# epsilon Double 0.001 Procedure terminates early if the reduction in objective function
+# value is less than epsilon (tolerance) times the initial objective function value.
+# lambda Double 1.0 Regularization parameter (lambda) for L2 regularization
+# maxIterations Integer 100 Maximum number of conjugate gradient iterations
+# verbose Boolean False Set to true to print while training.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+#-----------------------------------------------------------------------------------------------------------------------
+# model Matrix[Double] model matrix
+#-----------------------------------------------------------------------------------------------------------------------
m_msvm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE,
Double epsilon = 0.001, Double lambda = 1.0, Integer maxIterations = 100,
diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml
index 2b4fa42..4c7460f 100644
--- a/scripts/builtin/msvmPredict.dml
+++ b/scripts/builtin/msvmPredict.dml
@@ -20,21 +20,23 @@
#-------------------------------------------------------------
# This Scripts helps in applying an trained MSVM
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- matrix X of feature vectors to classify
-# W Double --- matrix of the trained variables
-# ---------------------------------------------------------------------------------------------
+#-----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#-----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix X of feature vectors to classify
+# W Matrix[Double] --- matrix of the trained variables
+#-----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Y^ Double --- Classification Labels Raw, meaning not modified to clean
-# Labeles of 1's and -1's
-# Y Double --- Classification Labels Maxed to ones and zeros.
+#-----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+#-----------------------------------------------------------------------------------------------------------------------
+# YRaw Matrix[Double] Classification Labels Raw, meaning not modified to clean
+# Labels of 1's and -1's
+# Y Matrix[Double] Classification Labels Maxed to ones and zeros.
+#-----------------------------------------------------------------------------------------------------------------------
m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W)
return(Matrix[Double] YRaw, Matrix[Double] Y)
diff --git a/scripts/builtin/multiLogReg.dml b/scripts/builtin/multiLogReg.dml
index a4fdf6a..d6412e8 100644
--- a/scripts/builtin/multiLogReg.dml
+++ b/scripts/builtin/multiLogReg.dml
@@ -23,28 +23,29 @@
# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
# The largest label represents the baseline category; if label -1 or 0 is present, then it is
# the baseline label (and it is converted to the largest label).
-
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X Matrix --- Location to read the matrix of feature vectors
-# Y Matrix --- Location to read the matrix with category labels
-# icpt Integer 0 Intercept presence, shifting and rescaling X columns: 0 = no intercept, no shifting, no rescaling; 1 = add intercept, but neither shift nor rescale X; 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# tol Double 0.000001 tolerance ("epsilon")
-# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized
-# maxi Integer 100 max. number of outer (Newton) iterations
-# maxii Integer 0 max. number of inner (conjugate gradient) iterations, 0 = no max
-# verbose Boolean FALSE flag specifying if logging information should be printed
#
-# --------------------------------------------------------------------------------------------
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Location to read the matrix of feature vectors
+# Y Matrix[Double] --- Location to read the matrix with category labels
+# icpt Integer 2 Intercept presence, shifting and rescaling X columns: 0 = no intercept,
+# no shifting, no rescaling; 1 = add intercept, but neither shift nor
+# rescale X; 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# tol Double 0.000001 tolerance ("epsilon")
+# reg Double 1.0 regularization parameter (lambda = 1/C); intercept is not regularized
+# maxi Integer 100 max. number of outer (Newton) iterations
+# maxii Integer 20 max. number of inner (conjugate gradient) iterations, 0 = no max
+# verbose Boolean TRUE flag specifying if logging information should be printed
+# ----------------------------------------------------------------------------------------------------------------------
#
# OUTPUT:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# betas Double regression betas as output for prediction
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# betas Matrix[Double] regression betas as output for prediction
+# ----------------------------------------------------------------------------------------------------------------------
m_multiLogReg = function(Matrix[Double] X, Matrix[Double] Y, Int icpt = 2,
Double tol=1e-6, Double reg=1.0, Int maxi=100, Int maxii=20, Boolean verbose=TRUE)
diff --git a/scripts/builtin/multiLogRegPredict.dml b/scripts/builtin/multiLogRegPredict.dml
index a7bbc23..2b1f80c 100644
--- a/scripts/builtin/multiLogRegPredict.dml
+++ b/scripts/builtin/multiLogRegPredict.dml
@@ -19,27 +19,27 @@
#
#-------------------------------------------------------------
-
# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF MULTINOMIAL LOGISTIC REGRESSION TO A NEW (TEST) DATASET
# Matrix M of predicted means/probabilities, some statistics in CSV format (see below)
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Data Matrix X
-# B Matrix --- Regression parameters betas
-# Y Matrix --- Response vector Y
-# verbose Boolean FALSE /
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix X
+# B Matrix[Double] --- Regression parameters betas
+# Y Matrix[Double] --- Response vector Y
+# verbose Boolean FALSE flag specifying if logging information should be printed
+# ----------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# M Double --- Matrix M of predicted means/probabilities
-# predicted_Y Double --- Predicted response vector
-# accuracy Double --- scalar value of accuracy
-# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] Matrix M of predicted means/probabilities
+# predicted_Y Matrix[Double] Predicted response vector
+# accuracy Double scalar value of accuracy
+# ----------------------------------------------------------------------------------------------------------------------
m_multiLogRegPredict = function(Matrix[Double] X, Matrix[Double] B, Matrix[Double] Y, Boolean verbose = FALSE)
return(Matrix[Double] M, Matrix[Double] predicted_Y, Double accuracy)
diff --git a/scripts/builtin/na_locf.dml b/scripts/builtin/na_locf.dml
index 4a6c530..30f7572 100644
--- a/scripts/builtin/na_locf.dml
+++ b/scripts/builtin/na_locf.dml
@@ -20,22 +20,23 @@
#-------------------------------------------------------------
# Builtin function for imputing missing values using forward fill and backward fill techniques
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X
-# option String "locf" String "locf" (last observation moved forward) to do forward fill
-# String "nocb" (next observation carried backward) to do backward fill
-# verbose Boolean FALSE to print output on screen
-# ---------------------------------------------------------------------------------------------
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# output Double --- Matrix with no missing values
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# option String "locf" String "locf" (last observation moved forward) to do forward fill
+# String "nocb" (next observation carried backward) to do backward fill
+# verbose Boolean FALSE to print output on screen
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# output Matrix[Double] Matrix with no missing values
+# ----------------------------------------------------------------------------------------------------------------------
m_na_locf = function(Matrix[Double] X, String option = "locf", Boolean verbose = FALSE)
return(Matrix[Double] output)
diff --git a/scripts/builtin/naiveBayes.dml b/scripts/builtin/naiveBayes.dml
index 7911cd0..888de14 100644
--- a/scripts/builtin/naiveBayes.dml
+++ b/scripts/builtin/naiveBayes.dml
@@ -19,6 +19,26 @@
#
#-------------------------------------------------------------
+# The naiveBayes-function computes the class conditional probabilities and class priors.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# D Matrix[Double] --- One dimensional column matrix with N rows.
+# C Matrix[Double] --- One dimensional column matrix with N rows.
+# laplace Double 1 Any Double value.
+# verbose Boolean TRUE Boolean value.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# prior Matrix[Double] Class priors, One dimensional column matrix with N rows.
+# classConditionals Matrix[Double] Class conditional probabilities, One dimensional column matrix with N rows.
+# ----------------------------------------------------------------------------------------------------------------------
+
m_naiveBayes = function(Matrix[Double] D,
Matrix[Double] C, Double laplace = 1, Boolean verbose = TRUE)
return (Matrix[Double] prior, Matrix[Double] classConditionals)
diff --git a/scripts/builtin/naiveBayesPredict.dml b/scripts/builtin/naiveBayesPredict.dml
index 8ae938e..7efb965 100644
--- a/scripts/builtin/naiveBayesPredict.dml
+++ b/scripts/builtin/naiveBayesPredict.dml
@@ -19,6 +19,25 @@
#
#-------------------------------------------------------------
+# The naiveBayesPredict-function predicts the scoring with a naive Bayes model.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of test data with N rows.
+# P Matrix[Double] --- Class priors, One dimensional column matrix with N rows.
+# C Matrix[Double] --- Class conditional probabilities, matrix with N rows
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# YRaw Matrix[Double] A matrix containing the raw prediction scores.
+# Y Matrix[Double] A matrix containing the predicted class for each row of X.
+# ----------------------------------------------------------------------------------------------------------------------
+
m_naiveBayesPredict = function(Matrix[Double] X, Matrix[Double] P, Matrix[Double] C)
return (Matrix[Double] YRaw, Matrix[Double] Y)
{
diff --git a/scripts/builtin/normalize.dml b/scripts/builtin/normalize.dml
index f7b86c2..9e72404 100644
--- a/scripts/builtin/normalize.dml
+++ b/scripts/builtin/normalize.dml
@@ -22,16 +22,21 @@
# Min-max normalization (a.k.a. min-max scaling) to range [0,1]. For matrices
# of positive values, this normalization preserves the input sparsity.
#
-# ------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix of shape n-by-m
-# ------------------------------------------------------------------------------
-# Y Matrix --- Modified output feature matrix of shape n-by-m
-# cmin Matrix --- Colunm minima of shape 1-by-m
-# cmax Matrix --- Column maxima of shape 1-by-m
-# ------------------------------------------------------------------------------
-
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix of shape n-by-m
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Modified output feature matrix of shape n-by-m
+# cmin Matrix[Double] Column minima of shape 1-by-m
+# cmax Matrix[Double] Column maxima of shape 1-by-m
+# ----------------------------------------------------------------------------------------------------------------------
m_normalize = function(Matrix[Double] X)
return (Matrix[Double] Y, Matrix[Double] cmin, Matrix[Double] cmax)
diff --git a/scripts/builtin/normalizeApply.dml b/scripts/builtin/normalizeApply.dml
index 07fad33..e63f65a 100644
--- a/scripts/builtin/normalizeApply.dml
+++ b/scripts/builtin/normalizeApply.dml
@@ -24,14 +24,20 @@
# preserves the input sparsity. The validity of the provided min-max range
# and post-processing is under control of the caller.
#
+# INPUT PARAMETERS:
# ------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix of shape n-by-m
-# cmin Matrix --- Colunm minima of shape 1-by-m
-# cmax Matrix --- Column maxima of shape 1-by-m
+# X Matrix[Double] --- Input feature matrix of shape n-by-m
+# cmin Matrix[Double] --- Column minima of shape 1-by-m
+# cmax Matrix[Double] --- Column maxima of shape 1-by-m
# ------------------------------------------------------------------------------
-# Y Matrix --- Modified output feature matrix of shape n-by-m
+#
+# OUTPUT:
+# ------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ------------------------------------------------------------------------------
+# Y Matrix[Double] Modified output feature matrix of shape n-by-m
# ------------------------------------------------------------------------------
diff --git a/scripts/builtin/outlier.dml b/scripts/builtin/outlier.dml
index 895e738..1026052 100644
--- a/scripts/builtin/outlier.dml
+++ b/scripts/builtin/outlier.dml
@@ -19,6 +19,25 @@
#
#-------------------------------------------------------------
+# This outlier-function takes a matrix data set as input from where it determines
+# which point(s) have the largest difference from mean.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of Recoded dataset for outlier evaluation
+# opposite Boolean --- (1)TRUE for evaluating outlier from upper quartile range,
+# (0)FALSE for evaluating outlier from lower quartile range
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] matrix indicating outlier values
+# ----------------------------------------------------------------------------------------------------------------------
+
m_outlier = function(Matrix[Double] X, Boolean opposite) return (Matrix[Double] Y) {
# determine if largest value has largest diff from mean
I = (colMaxs(X)-colMeans(X)) > (colMeans(X)-colMins(X));
diff --git a/scripts/builtin/outlierByArima.dml b/scripts/builtin/outlierByArima.dml
index 8c3ca3a..8142860 100644
--- a/scripts/builtin/outlierByArima.dml
+++ b/scripts/builtin/outlierByArima.dml
@@ -19,36 +19,34 @@
#
#-------------------------------------------------------------
-
-# Built-in function for detecting and repairing outliers in time series,
-# by training an ARIMA model and classifying values that are more than
-# k standard-deviations away from the predicated values as outliers.
+# Built-in function for detecting and repairing outliers in time series, by training an ARIMA model
+# and classifying values that are more than k standard-deviations away from the predicted values as outliers.
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X
-# k Double 3 threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
-# repairMethod Integer 1 values: 0 = delete rows having outliers, 1 = replace outliers as zeros
-# 2 = replace outliers as missing values
-# p Int 0 non-seasonal AR order
-# d Int 0 non-seasonal differencing order
-# q Int 0 non-seasonal MA order
-# P Int 0 seasonal AR order
-# D Int 0 seasonal differencing order
-# Q Int 0 seasonal MA order
-# s Int 1 period in terms of number of time-steps
-# include_mean Bool FALSE
-# solver String "jacobi" solver, is either "cg" or "jacobi"
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X_corrected Double --- Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# k Double 3 threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
+# repairMethod Integer 1 values: 0 = delete rows having outliers, 1 = replace outliers as zeros
+# 2 = replace outliers as missing values
+# p Int 0 non-seasonal AR order
+# d Int 0 non-seasonal differencing order
+# q Int 0 non-seasonal MA order
+# P Int 0 seasonal AR order
+# D Int 0 seasonal differencing order
+# Q Int 0 seasonal MA order
+# s Int 1 period in terms of number of time-steps
+# include_mean Bool FALSE
+# solver String "jacobi" solver, is either "cg" or "jacobi"
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_corrected Matrix[Double] Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
m_outlierByArima = function(Matrix[Double] X, Double k = 3, Integer repairMethod = 1, Integer p=0, Integer d=0,
Integer q=0, Integer P=0, Integer D=0, Integer Q=0, Integer s=1, Boolean include_mean=FALSE, String solver="jacobi")
diff --git a/scripts/builtin/outlierByIQR.dml b/scripts/builtin/outlierByIQR.dml
index b394f69..27e2851 100644
--- a/scripts/builtin/outlierByIQR.dml
+++ b/scripts/builtin/outlierByIQR.dml
@@ -19,32 +19,32 @@
#
#-------------------------------------------------------------
-
# Builtin function for detecting and repairing outliers using standard deviation
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X
-# k Double 1.5 a constant used to discern outliers k*IQR
-# isIterative Boolean TRUE iterative repair or single repair
-# repairMethod Integer 1 values: 0 = delete rows having outliers,
-# 1 = replace outliers with zeros
-# 2 = replace outliers as missing values
-# max_iterations Integer 0 values: 0 = arbitrary number of iteraition until all outliers are removed,
-# n = any constant defined by user
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Y Double --- Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# k Double 1.5 a constant used to discern outliers k*IQR
+# isIterative Boolean TRUE iterative repair or single repair
+# repairMethod Integer 1 values: 0 = delete rows having outliers,
+# 1 = replace outliers with zeros
+# 2 = replace outliers as missing values
+# max_iterations Integer 0 values: 0 = arbitrary number of iterations until all outliers are removed,
+# n = any constant defined by user
+# verbose Boolean TRUE flag specifying if logging information should be printed
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
-m_outlierByIQR = function(Matrix[Double] X, Double k =1.5, Integer repairMethod = 1,
- Integer max_iterations, Boolean verbose = TRUE) return(Matrix[Double] Y)
+m_outlierByIQR = function(Matrix[Double] X, Double k =1.5, Integer repairMethod = 1,
+ Integer max_iterations, Boolean verbose = TRUE) return(Matrix[Double] Y)
{
sumPrevious = as.double(0)
diff --git a/scripts/builtin/outlierBySd.dml b/scripts/builtin/outlierBySd.dml
index 9362dca..2705679 100644
--- a/scripts/builtin/outlierBySd.dml
+++ b/scripts/builtin/outlierBySd.dml
@@ -19,32 +19,30 @@
#
#-------------------------------------------------------------
-
-# Builtin function for detecting and repairing outliers using standard deviation
+# Builtin function for detecting and repairing outliers using standard deviation
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X
-# k Double 3 threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
-# repairMethod Integer 1 values: 0 = delete rows having outliers, 1 = replace outliers as zeros
-# 2 = replace outliers as missing values
-# max_iterations Integer 0 values: 0 = arbitrary number of iteration until all outliers are removed,
-# n = any constant defined by user
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Y Double --- Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X
+# k Double 3 threshold values 1, 2, 3 for 68%, 95%, 99.7% respectively (3-sigma rule)
+# repairMethod Integer 1 values: 0 = delete rows having outliers, 1 = replace outliers as zeros
+# 2 = replace outliers as missing values
+# max_iterations Integer 0 values: 0 = arbitrary number of iteration until all outliers are removed,
+# n = any constant defined by user
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Matrix X with no outliers
+# ----------------------------------------------------------------------------------------------------------------------
m_outlierBySd = function(Matrix[Double] X, Double k = 3, Integer repairMethod = 1,
Integer max_iterations, Boolean verbose = TRUE) return(Matrix[Double] Y)
{
-
# variable initialization
sumPrevious = as.double(0)
sumNext = as.double(1)
diff --git a/scripts/builtin/pca.dml b/scripts/builtin/pca.dml
index 3054d4c..37d2a55 100644
--- a/scripts/builtin/pca.dml
+++ b/scripts/builtin/pca.dml
@@ -19,24 +19,27 @@
#
#-------------------------------------------------------------
-# Principal Component Analysis (PCA) for dimensionality reduction
-#
+# The function Principal Component Analysis (PCA) is used for dimensionality reduction
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# K Int 2 Number of reduced dimensions (i.e., columns)
-# Center Boolean TRUE Indicates whether or not to center the feature matrix
-# Scale Boolean TRUE Indicates whether or not to scale the feature matrix
-
-# RETURN VALUES
-# ---------------------------------------------------------------------------------------------
-# Xout Matrix --- Output feature matrix with K columns
-# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
-# Centering Matrix --- The column means of the input, subtracted to construct the PCA
-# ScaleFactor Matrix --- The Scaling of the values, to make each dimension same size.
-# ---------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# K Int 2 Number of reduced dimensions (i.e., columns)
+# center Boolean TRUE Indicates whether or not to center the feature matrix
+# scale Boolean TRUE Indicates whether or not to scale the feature matrix
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Xout Matrix[Double] Output feature matrix with K columns
+# Mout Matrix[Double] Output dominant eigen vectors (can be used for projections)
+# Centering Matrix[Double] The column means of the input, subtracted to construct the PCA
+# ScaleFactor Matrix[Double] The Scaling of the values, to make each dimension same size.
+# ----------------------------------------------------------------------------------------------------------------------
m_pca = function(Matrix[Double] X, Integer K=2, Boolean center=TRUE, Boolean scale=TRUE)
return (Matrix[Double] Xout, Matrix[Double] Mout, Matrix[Double] Centering, Matrix[Double] ScaleFactor)
diff --git a/scripts/builtin/pcaInverse.dml b/scripts/builtin/pcaInverse.dml
index 6c416a2..96983a3 100644
--- a/scripts/builtin/pcaInverse.dml
+++ b/scripts/builtin/pcaInverse.dml
@@ -20,19 +20,25 @@
#-------------------------------------------------------------
# Principal Component Analysis (PCA) for reconstruction of approximation of the original data.
-#
# This methods allows to reconstruct an approximation of the original matrix, and is usefull for
# calculating how much information is lost in the PCA.
#
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input features that have PCA applied to them
-# Centering Matrix empty matrix The column means of the PCA model, subtracted to construct the PCA
-# ScaleFactor Matrix empty matrix The scaling of each dimension in the PCA model
-# ---------------------------------------------------------------------------------------------
-# Y Matrix --- Output feature matrix reconstructing and approximation of the original matrix
-# ---------------------------------------------------------------------------------------------
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] --- Input features that have PCA applied to them
+# Clusters Matrix[Double] --- The previous PCA components computed
+# Centering Matrix[Double] empty matrix The column means of the PCA model, subtracted to construct the PCA
+# ScaleFactor Matrix[Double] empty matrix The scaling of each dimension in the PCA model
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] Output feature matrix reconstructing an approximation of the original matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_pcaInverse = function(Matrix[Double] Y, Matrix[Double] Clusters,
Matrix[Double] Centering = matrix(0, rows= 0, cols=0),
diff --git a/scripts/builtin/pcaTransform.dml b/scripts/builtin/pcaTransform.dml
index 429342d..b7588c8 100644
--- a/scripts/builtin/pcaTransform.dml
+++ b/scripts/builtin/pcaTransform.dml
@@ -20,19 +20,25 @@
#-------------------------------------------------------------
# Principal Component Analysis (PCA) for dimensionality reduction prediciton
-#
# This method is used to transpose data, which the PCA model was not trained on. To validate how good
# The PCA is, and to apply in production.
#
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# Centering Matrix empty matrix The column means of the PCA model, subtracted to construct the PCA
-# ScaleFactor Matrix empty matrix The scaling of each dimension in the PCA model
-# ---------------------------------------------------------------------------------------------
-# Y Matrix --- Output feature matrix dimensionally reduced by PCA
-# ---------------------------------------------------------------------------------------------
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# Clusters Matrix[Double] --- The previously computed principal components
+# Centering Matrix[Double] empty matrix The column means of the PCA model, subtracted to construct the PCA
+# ScaleFactor Matrix[Double] empty matrix The scaling of each dimension in the PCA model
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Output feature matrix dimensionally reduced by PCA
+# ----------------------------------------------------------------------------------------------------------------------
m_pcaTransform = function(Matrix[Double] X, Matrix[Double] Clusters,
Matrix[Double] Centering = matrix(0, rows= 0, cols=0),
diff --git a/scripts/builtin/pnmf.dml b/scripts/builtin/pnmf.dml
index b31fd57..3213d85 100644
--- a/scripts/builtin/pnmf.dml
+++ b/scripts/builtin/pnmf.dml
@@ -19,11 +19,32 @@
#
#-------------------------------------------------------------
-# Implements Poisson Nonnegative Matrix Factorization (PNMF)
+# The pnmf-function implements Poisson Non-negative Matrix Factorization (PNMF). Matrix X is factorized into two
+# non-negative matrices, W and H based on Poisson probabilistic assumption. This non-negativity makes the resulting
+# matrices easier to inspect.
#
# [Chao Liu, Hung-chih Yang, Jinliang Fan, Li-Wei He, Yi-Min Wang:
# Distributed nonnegative matrix factorization for web-scale dyadic
# data analysis on mapreduce. WWW 2010: 681-690]
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# rnk Integer --- Number of components into which matrix X is to be factored.
+# eps Double 10^-8 Tolerance
+# maxi Integer 10 Maximum number of conjugate gradient iterations.
+# verbose Boolean TRUE If TRUE, 'iter' and 'obj' are printed.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# W Matrix[Double] List of pattern matrices, one for each repetition.
+# H Matrix[Double] List of amplitude matrices, one for each repetition.
+# ----------------------------------------------------------------------------------------------------------------------
m_pnmf = function(Matrix[Double] X, Integer rnk, Double eps = 1e-8, Integer maxi = 10, Boolean verbose=TRUE)
return (Matrix[Double] W, Matrix[Double] H)
diff --git a/scripts/builtin/ppca.dml b/scripts/builtin/ppca.dml
index dfd7452..7d09b34 100644
--- a/scripts/builtin/ppca.dml
+++ b/scripts/builtin/ppca.dml
@@ -22,25 +22,27 @@
# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data.
# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed
# Platforms. Tarek Elgamal et.al.
-
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- n x m input feature matrix
-# k Integer --- indicates dimension of the new vector space constructed from eigen vectors
-# maxi Integer --- maximum number of iterations until convergence
-# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm
-# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm
-# verbose Boolen TRUE verbose debug output
-# ---------------------------------------------------------------------------------------------
-# OUTPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Xout Matrix --- Output feature matrix with K columns
-# Mout Matrix --- Output dominant eigen vectors (can be used for projections)
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- n x m input feature matrix
+# K Integer 2 indicates dimension of the new vector space constructed from eigen vectors
+# maxi Integer 10 maximum number of iterations until convergence
+# tolobj Double 0.00001 objective function tolerance value to stop ppca algorithm
+# tolrecerr Double 0.02 reconstruction error tolerance value to stop the algorithm
+# verbose Boolean TRUE verbose debug output
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Xout Matrix[Double] Output feature matrix with K columns
+# Mout Matrix[Double] Output dominant eigen vectors (can be used for projections)
+# ----------------------------------------------------------------------------------------------------------------------
m_ppca = function(Matrix[Double] X, Integer K=2, Integer maxi = 10,
Double tolobj = 0.00001, Double tolrecerr = 0.02, Boolean verbose = TRUE)
diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
index 3975fbe..7b80cbc 100644
--- a/scripts/builtin/randomForest.dml
+++ b/scripts/builtin/randomForest.dml
@@ -19,51 +19,55 @@
#
#-------------------------------------------------------------
+# This script implements classification random forest with both scale and categorical features.
#
-# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Feature matrix X; note that X needs to be both recoded and dummy coded
+# Y Matrix[Double] --- Label matrix Y; note that Y needs to be both recoded and dummy coded
+# R Matrix[Double] " " Matrix which for each feature in X contains the following information
+# - R[,1]: column ids TODO pass recorded and binned
+# - R[,2]: start indices
+# - R[,3]: end indices
+# If R is not provided by default all variables are assumed to be scale
+# bins Int 20 Number of equiheight bins per scale feature to choose thresholds
+# depth Int 25 Maximum depth of the learned tree
+# num_leaf Int 10 Number of samples when splitting stops and a leaf node is added
+# num_samples Int 3000 Number of samples at which point we switch to in-memory subtree building
+# num_trees Int 10 Number of trees to be learned in the random forest model
+# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a
+# Poisson distribution with parameter subsamp_rate (the default value is 1.0)
+# feature_subset Double 0.5 Parameter that controls the number of feature used as candidates for splitting at each tree node
+# as a power of number of features in the dataset;
+# by default square root of features (i.e., feature_subset = 0.5) are used at each tree node
+# impurity String "Gini" Impurity measure: entropy or Gini (the default)
+# ----------------------------------------------------------------------------------------------------------------------
#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Feature matrix X; note that X needs to be both recoded and dummy coded
-# Y Matrix --- Label matrix Y; note that Y needs to be both recoded and dummy coded
-# R Matrix " " Matrix which for each feature in X contains the following information
-# - R[,1]: column ids TODO pass recorded and binned
-# - R[,2]: start indices
-# - R[,3]: end indices
-# If R is not provided by default all variables are assumed to be scale
-# bins Int 20 Number of equiheight bins per scale feature to choose thresholds
-# depth Int 25 Maximum depth of the learned tree
-# num_leaf Int 10 Number of samples when splitting stops and a leaf node is added
-# num_samples Int 3000 Number of samples at which point we switch to in-memory subtree building
-# num_trees Int 10 Number of trees to be learned in the random forest model
-# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a
-# Poisson distribution with parameter subsamp_rate (the default value is 1.0)
-# feature_subset Double 0.5 Parameter that controls the number of feature used as candidates for splitting at each tree node
-# as a power of number of features in the dataset;
-# by default square root of features (i.e., feature_subset = 0.5) are used at each tree node
-# impurity String "Gini" Impurity measure: entropy or Gini (the default)
-# ---------------------------------------------------------------------------------------------
-
-# Output TYPE MEANING:
-# ---------------------------------------------------------------------------------------------
-# M Matrix Matrix M containing the learned tree, where each column corresponds to a node
-# in the learned tree and each row contains the following information:
-# M[1,j]: id of node j (in a complete binary tree)
-# M[2,j]: tree id to which node j belongs
-# M[3,j]: Offset (no. of columns) to left child of j
-# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] Matrix M containing the learned tree, where each column corresponds to a node
+# in the learned tree and each row contains the following information:
+# M[1,j]: id of node j (in a complete binary tree)
+# M[2,j]: tree id to which node j belongs
+# M[3,j]: Offset (no. of columns) to left child of j
+# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2
+# for categorical features,
# otherwise the label that leaf node j is supposed to predict
-# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values
+# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the
+# size of the subset of values
# stored in rows 7,8,... if j is categorical
-# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale;
-# If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
-# C Matrix Matrix C containing the number of times samples are chosen in each tree of the random forest
-# S_map Matrix Mappings from scale feature ids to global feature ids
-# C_map Matrix Mappings from categorical feature ids to global feature ids
-# -------------------------------------------------------------------------------------------
+# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is
+# compared to is stored at M[7,j] if the feature chosen for j is scale;
+# If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
+# C Matrix[Double] Matrix C containing the number of times samples are chosen in each tree of the random forest
+# S_map Matrix[Double] Mappings from scale feature ids to global feature ids
+# C_map Matrix[Double] Mappings from categorical feature ids to global feature ids
+# ----------------------------------------------------------------------------------------------------------------------
m_randomForest = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R,
Integer bins = 20, Integer depth = 25, Integer num_leaf = 10, Integer num_samples = 3000,
diff --git a/scripts/builtin/scale.dml b/scripts/builtin/scale.dml
index 3ec4a54..00840ea 100644
--- a/scripts/builtin/scale.dml
+++ b/scripts/builtin/scale.dml
@@ -19,18 +19,25 @@
#
#-------------------------------------------------------------
-# Scale and center individual features in the input matrix (column wise.) using z-score to scale the values.
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# Center Boolean TRUE Indicates whether or not to center the feature matrix
-# Scale Boolean TRUE Indicates whether or not to scale the feature matrix
-# ---------------------------------------------------------------------------------------------
-# Y Matrix --- Output feature matrix with K columns
-# ColMean Matrix --- The column means of the input, subtracted if Center was TRUE
-# ScaleFactor Matrix --- The Scaling of the values, to make each dimension have similar value ranges
-# ---------------------------------------------------------------------------------------------
+# This function scales and centers individual features in the input matrix (column-wise) using z-score to scale the values.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# center Boolean TRUE Indicates whether or not to center the feature matrix
+# scale Boolean TRUE Indicates whether or not to scale the feature matrix
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Output feature matrix with K columns
+# ColMean Matrix[Double] The column means of the input, subtracted if Center was TRUE
+# ScaleFactor Matrix[Double] The Scaling of the values, to make each dimension have similar value ranges
+# ----------------------------------------------------------------------------------------------------------------------
m_scale = function(Matrix[Double] X, Boolean center, Boolean scale)
return (Matrix[Double] Y, Matrix[Double] ColMean, Matrix[Double] ScaleFactor)
diff --git a/scripts/builtin/scaleApply.dml b/scripts/builtin/scaleApply.dml
index 424bc1d..e592e48 100644
--- a/scripts/builtin/scaleApply.dml
+++ b/scripts/builtin/scaleApply.dml
@@ -19,16 +19,23 @@
#
#-------------------------------------------------------------
-# Scale and center individual features in the input matrix (column wise.) using the input matrices.
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# Centering Matrix --- The column means to subtract from X (not done if empty)
-# ScaleFactor Matrix --- The column scaling to multiply with X (not done if empty)
-# ---------------------------------------------------------------------------------------------
-# Y Matrix --- Output feature matrix with K columns
-# ---------------------------------------------------------------------------------------------
+# This function scales and centers individual features in the input matrix (column-wise) using the input matrices.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# Centering Matrix[Double] --- The column means to subtract from X (not done if empty)
+# ScaleFactor Matrix[Double] --- The column scaling to multiply with X (not done if empty)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Output feature matrix with K columns
+# ----------------------------------------------------------------------------------------------------------------------
m_scaleApply = function(Matrix[Double] X, Matrix[Double] Centering, Matrix[Double] ScaleFactor)
return (Matrix[Double] Y)
diff --git a/scripts/builtin/selectByVarThresh.dml b/scripts/builtin/selectByVarThresh.dml
index a07ce35..66d06f1 100644
--- a/scripts/builtin/selectByVarThresh.dml
+++ b/scripts/builtin/selectByVarThresh.dml
@@ -19,6 +19,23 @@
#
#-------------------------------------------------------------
+# This function drops features with <= thresh variance (by default drops constants).
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# thresh Double 0 Variance threshold: features with variance <= thresh are dropped
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Xp Matrix[Double] Matrix of feature vectors with > thresh variance.
+# ----------------------------------------------------------------------------------------------------------------------
+
m_selectByVarThresh = function(Matrix[Double] X, Double thresh = 0)
return (Matrix[Double] Xp, Matrix[Double] I)
{
diff --git a/scripts/builtin/setdiff.dml b/scripts/builtin/setdiff.dml
index df74591..3c7bdc8 100644
--- a/scripts/builtin/setdiff.dml
+++ b/scripts/builtin/setdiff.dml
@@ -20,21 +20,21 @@
#-------------------------------------------------------------
# Builtin function that implements difference operation on vectors
-
+#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# X Matrix --- input vector
+# X Matrix[Double] --- input vector
+# Y Matrix[Double] --- input vector
# ---------------------------------------------------------------------------------------------
-# Y Matrix --- input vector
+#
+# OUTPUT:
# ---------------------------------------------------------------------------------------------
-
-# Output(s)
+# NAME TYPE MEANING
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# R Matrix[Double] vector with all elements that are present in X but not in Y
# ---------------------------------------------------------------------------------------------
-# R Matrix --- vector with all elements that are present in X but not in Y
setdiff = function(Matrix[double] X, Matrix[double] Y)
return (matrix[double] R)
diff --git a/scripts/builtin/sherlock.dml b/scripts/builtin/sherlock.dml
index 53af20a..f82b02a 100644
--- a/scripts/builtin/sherlock.dml
+++ b/scripts/builtin/sherlock.dml
@@ -18,50 +18,58 @@
# under the License.
#
#-------------------------------------------------------------
-source("scripts/builtin/sherlockNet.dml") as sherlockNet
-# Implements training phase of Sherlock: A Deep Learning Approach to Semantic Data Type Detection
+# This function implements training phase of Sherlock: A Deep Learning Approach to Semantic Data Type Detection
#
# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
# 2019.]
-
+#
# Split feature matrix into four different feature categories and train neural networks on the
# respective single features. Then combine all trained features to train final neural network.
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X_train Matrix --- maxtrix of feature vectors
-# y_train Matrix --- matrix Y of class labels of semantic data type
-# ---------------------------------------------------------------------------------------------
-# cW Matrix --- weights (parameters) matrices for character distribtions
-# cb Matrix --- biases vectors for character distribtions
-# wW Matrix --- weights (parameters) matrices for word embeddings
-# wb Matrix --- biases vectors for word embeddings
-# pW Matrix --- weights (parameters) matrices for paragraph vectors
-# pb Matrix --- biases vectors for paragraph vectors
-# sW Matrix --- weights (parameters) matrices for global statistics
-# sb Matrix --- biases vectors for global statistics
-# fW Matrix --- weights (parameters) matrices for combining all trained features (final)
-# fb Matrix --- biases vectors for combining all trained features (final)
-# ---------------------------------------------------------------------------------------------
-
-m_sherlock = function(matrix[double] X_train, matrix[double] y_train)
- return (matrix[double] cW1, matrix[double] cb1,
- matrix[double] cW2, matrix[double] cb2,
- matrix[double] cW3, matrix[double] cb3,
- matrix[double] wW1, matrix[double] wb1,
- matrix[double] wW2, matrix[double] wb2,
- matrix[double] wW3, matrix[double] wb3,
- matrix[double] pW1, matrix[double] pb1,
- matrix[double] pW2, matrix[double] pb2,
- matrix[double] pW3, matrix[double] pb3,
- matrix[double] sW1, matrix[double] sb1,
- matrix[double] sW2, matrix[double] sb2,
- matrix[double] sW3, matrix[double] sb3,
- matrix[double] fW1, matrix[double] fb1,
- matrix[double] fW2, matrix[double] fb2,
- matrix[double] fW3, matrix[double] fb3) {
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_train Matrix[Double] --- matrix of feature vectors
+# y_train Matrix[Double] --- matrix Y of class labels of semantic data type
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# cW Matrix[Double] weights (parameters) matrices for character distributions
+# cb Matrix[Double] biases vectors for character distributions
+# wW Matrix[Double] weights (parameters) matrices for word embeddings
+# wb Matrix[Double] biases vectors for word embeddings
+# pW Matrix[Double] weights (parameters) matrices for paragraph vectors
+# pb Matrix[Double] biases vectors for paragraph vectors
+# sW Matrix[Double] weights (parameters) matrices for global statistics
+# sb Matrix[Double] biases vectors for global statistics
+# fW Matrix[Double] weights (parameters) matrices for combining all trained features (final)
+# fb Matrix[Double] biases vectors for combining all trained features (final)
+# ----------------------------------------------------------------------------------------------------------------------
+
+source("scripts/builtin/sherlockNet.dml") as sherlockNet
+
+m_sherlock = function(Matrix[Double] X_train, Matrix[Double] y_train)
+ return (Matrix[Double] cW1, Matrix[Double] cb1,
+ Matrix[Double] cW2, Matrix[Double] cb2,
+ Matrix[Double] cW3, Matrix[Double] cb3,
+ Matrix[Double] wW1, Matrix[Double] wb1,
+ Matrix[Double] wW2, Matrix[Double] wb2,
+ Matrix[Double] wW3, Matrix[Double] wb3,
+ Matrix[Double] pW1, Matrix[Double] pb1,
+ Matrix[Double] pW2, Matrix[Double] pb2,
+ Matrix[Double] pW3, Matrix[Double] pb3,
+ Matrix[Double] sW1, Matrix[Double] sb1,
+ Matrix[Double] sW2, Matrix[Double] sb2,
+ Matrix[Double] sW3, Matrix[Double] sb3,
+ Matrix[Double] fW1, Matrix[Double] fb1,
+ Matrix[Double] fW2, Matrix[Double] fb2,
+ Matrix[Double] fW3, Matrix[Double] fb3) {
train_cols = ncol(X_train)
train_rows = nrow(X_train)
[cW1, cb1, cW2, cb2, cW3, cb3] = sherlockNet::train(X_train[1:train_rows, 224:1183], y_train, 300)
@@ -93,7 +101,7 @@ m_sherlock = function(matrix[double] X_train, matrix[double] y_train)
# ---------------------------------------------------------------------------------------------
# m_data Matrix --- transformed matrix containing X values
-transform_values = function(frame[string] data) return (matrix[double] m_data) {
+transform_values = function(frame[string] data) return (Matrix[Double] m_data) {
rows = nrow(data)
cols = ncol(data)
@@ -119,7 +127,7 @@ transform_values = function(frame[string] data) return (matrix[double] m_data) {
# ground_truth Matrix --- matrix containing ground truth in numerical representation
# meta_data String --- String contaning meta data of transformation encoding
-transform_encode_labels = function(frame[string] data, string transform_spec) return (matrix[double] ground_truth , frame[string] meta_data) {
+transform_encode_labels = function(frame[string] data, string transform_spec) return (Matrix[Double] ground_truth , frame[string] meta_data) {
rows = nrow(data)
cols = ncol(data)
@@ -148,7 +156,7 @@ transform_encode_labels = function(frame[string] data, string transform_spec) re
# ---------------------------------------------------------------------------------------------
# ground_truth Matrix --- matrix containing ground truth in numerical representation
-transform_apply_labels = function(frame[string] data, frame[string] meta_data, string transform_spec) return (matrix[double] ground_truth) {
+transform_apply_labels = function(frame[string] data, frame[string] meta_data, string transform_spec) return (Matrix[Double] ground_truth) {
#remove index row
rows = nrow(data)
cols = ncol(data)
@@ -174,8 +182,8 @@ transform_apply_labels = function(frame[string] data, frame[string] meta_data, s
# ---------------------------------------------------------------------------------------------
# ground_truth Matrix --- matrix containing ground truth in numerical representation
-transform_labels_to_ground_truth = function(matrix[double] data)
- return(matrix[double] ground_truth) {
+transform_labels_to_ground_truth = function(Matrix[Double] data)
+ return(Matrix[Double] ground_truth) {
rows = nrow(data)
ground_truth = matrix(0, rows=rows, cols=78)
diff --git a/scripts/builtin/sherlockNet.dml b/scripts/builtin/sherlockNet.dml
index b29b8dc..63edf82 100644
--- a/scripts/builtin/sherlockNet.dml
+++ b/scripts/builtin/sherlockNet.dml
@@ -18,6 +18,33 @@
# under the License.
#
#-------------------------------------------------------------
+
+# This function implements Neural Network for Sherlock: A Deep Learning Approach to Semantic Data Type Detection
+# Trains a 2 hidden layer softmax classifier.
+#
+# REFERENCE:
+# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
+# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
+# 2019.]
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_train Matrix[Double] --- input data matrix, of shape (N, D)
+# y_train Matrix[Double] --- target matrix, of shape (N, K)
+# hidden_layer_neurons int --- number of neurons per hidden layer
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# W Matrix[Double] weights (parameters) matrix, of shape (D, M, 3).
+# b Matrix[Double] biases vector, of shape (1, M, 3).
+# ----------------------------------------------------------------------------------------------------------------------
+
+
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
@@ -26,25 +53,8 @@ source("nn/layers/softmax.dml")as softmax
source("nn/optim/adam.dml") as adam
source("scripts/staging/entity-resolution/primitives/evaluation.dml") as evaluation
-# Implements Neural Network for Sherlock: A Deep Learning Approach to Semantic Data Type Detection
-#
-# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
-# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
-# 2019.]
-
-# Trains a 2 hidden layer softmax classifier.
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X_train Matrix --- input data matrix, of shape (N, D)
-# y_train Matrix --- target matrix, of shape (N, K)
-# hidden_layer_neurons int number of neurons per hidden layer
-# ---------------------------------------------------------------------------------------------
-# W Matrix weights (parameters) matrix, of shape (D, M, 3).
-# b Matrix biases vector, of shape (1, M, 3).
-
-train = function(matrix[double] X_train, matrix[double] y_train, int hidden_layer_neurons)
- return (matrix[double] W1, matrix[double] b1, matrix[double] W2, matrix[double] b2, matrix[double] W3, matrix[double] b3) {
+train = function(Matrix[Double] X_train, Matrix[Double] y_train, int hidden_layer_neurons)
+ return (Matrix[Double] W1, Matrix[Double] b1, Matrix[Double] W2, Matrix[Double] b2, Matrix[Double] W3, Matrix[Double] b3) {
# Generate input data
N = nrow(X_train) # num examples
@@ -155,11 +165,11 @@ train = function(matrix[double] X_train, matrix[double] y_train, int hidden_laye
# ---------------------------------------------------------------------------------------------
# probs Matrix class probabilities of shape (N, K)
-predict = function(matrix[double] test_val,
- matrix[double] W1, matrix[double] b1,
- matrix[double] W2, matrix[double] b2,
- matrix[double] W3, matrix[double] b3)
- return (matrix[double] probs) {
+predict = function(Matrix[Double] test_val,
+ Matrix[Double] W1, Matrix[Double] b1,
+ Matrix[Double] W2, Matrix[Double] b2,
+ Matrix[Double] W3, Matrix[Double] b3)
+ return (Matrix[Double] probs) {
N = nrow(test_val)
K = ncol(W3) # num features
@@ -208,7 +218,7 @@ predict = function(matrix[double] test_val,
# precision double scalar precission, of shape (1)
# recall double scalar recall, of shape (1)
-eval = function(matrix[double] probs, matrix[double] Y)
+eval = function(Matrix[Double] probs, Matrix[Double] Y)
return (double loss, double accuracy, double f1, double precision, double recall) {
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
diff --git a/scripts/builtin/sherlockPredict.dml b/scripts/builtin/sherlockPredict.dml
index 9db9251..8765c98 100644
--- a/scripts/builtin/sherlockPredict.dml
+++ b/scripts/builtin/sherlockPredict.dml
@@ -18,51 +18,58 @@
# under the License.
#
#-------------------------------------------------------------
-source("scripts/builtin/sherlockNet.dml") as sherlockNet
-
-# Implements prediction and evaluation phase of Sherlock:
-# A Deep Learning Approach to Semantic Data Type Detection
-#
-# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
-# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
-# 2019.]
+# This function implements prediction and evaluation phase of Sherlock:
# Split feature matrix into four different feature categories and predicting the class probability
# on the respective features. Then combine all predictions for final predicted probabilities.
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Matrix --- matrix of values which are to be classified
-# cW Matrix --- weights (parameters) matrices for character distribtions
-# cb Matrix --- biases vectors for character distribtions
-# wW Matrix --- weights (parameters) matrices for word embeddings
-# wb Matrix --- biases vectors for word embeddings
-# pW Matrix --- weights (parameters) matrices for paragraph vectors
-# pb Matrix --- biases vectors for paragraph vectors
-# sW Matrix --- weights (parameters) matrices for global statistics
-# sb Matrix --- biases vectors for global statistics
-# fW Matrix --- weights (parameters) matrices for combining all trained features (final)
-# fb Matrix --- biases vectors for combining all trained features (final)
-# ---------------------------------------------------------------------------------------------
-# probs Matrix class probabilities of shape (N, K)
+# A Deep Learning Approach to Semantic Data Type Detection.
+# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
+# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
+# 2019.]
+
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- matrix of values which are to be classified
+# cW Matrix[Double] --- weights (parameters) matrices for character distributions
+# cb Matrix[Double] --- biases vectors for character distributions
+# wW Matrix[Double] --- weights (parameters) matrices for word embeddings
+# wb Matrix[Double] --- biases vectors for word embeddings
+# pW Matrix[Double] --- weights (parameters) matrices for paragraph vectors
+# pb Matrix[Double] --- biases vectors for paragraph vectors
+# sW Matrix[Double] --- weights (parameters) matrices for global statistics
+# sb Matrix[Double] --- biases vectors for global statistics
+# fW Matrix[Double] --- weights (parameters) matrices for combining all trained features (final)
+# fb Matrix[Double] --- biases vectors for combining all trained features (final)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# probs Matrix[Double] class probabilities of shape (N, K)
+# ----------------------------------------------------------------------------------------------------------------------
+
+source("scripts/builtin/sherlockNet.dml") as sherlockNet
-m_sherlockPredict = function(matrix[double] X,
- matrix[double] cW1, matrix[double] cb1,
- matrix[double] cW2, matrix[double] cb2,
- matrix[double] cW3, matrix[double] cb3,
- matrix[double] wW1, matrix[double] wb1,
- matrix[double] wW2, matrix[double] wb2,
- matrix[double] wW3, matrix[double] wb3,
- matrix[double] pW1, matrix[double] pb1,
- matrix[double] pW2, matrix[double] pb2,
- matrix[double] pW3, matrix[double] pb3,
- matrix[double] sW1, matrix[double] sb1,
- matrix[double] sW2, matrix[double] sb2,
- matrix[double] sW3, matrix[double] sb3,
- matrix[double] fW1, matrix[double] fb1,
- matrix[double] fW2, matrix[double] fb2,
- matrix[double] fW3, matrix[double] fb3)
- return (matrix[double] probs) {
+m_sherlockPredict = function(Matrix[Double] X,
+ Matrix[Double] cW1, Matrix[Double] cb1,
+ Matrix[Double] cW2, Matrix[Double] cb2,
+ Matrix[Double] cW3, Matrix[Double] cb3,
+ Matrix[Double] wW1, Matrix[Double] wb1,
+ Matrix[Double] wW2, Matrix[Double] wb2,
+ Matrix[Double] wW3, Matrix[Double] wb3,
+ Matrix[Double] pW1, Matrix[Double] pb1,
+ Matrix[Double] pW2, Matrix[Double] pb2,
+ Matrix[Double] pW3, Matrix[Double] pb3,
+ Matrix[Double] sW1, Matrix[Double] sb1,
+ Matrix[Double] sW2, Matrix[Double] sb2,
+ Matrix[Double] sW3, Matrix[Double] sb3,
+ Matrix[Double] fW1, Matrix[Double] fb1,
+ Matrix[Double] fW2, Matrix[Double] fb2,
+ Matrix[Double] fW3, Matrix[Double] fb3)
+ return (Matrix[Double] probs) {
rows = nrow(X)
@@ -89,7 +96,7 @@ m_sherlockPredict = function(matrix[double] X,
# precision double scalar precission, of shape (1)
# recall double scalar recall, of shape (1)
-eval = function(matrix[double] probs, matrix[double] Y)
+eval = function(Matrix[Double] probs, Matrix[Double] Y)
return (double loss, double accuracy, double f1, double precision, double recall) {
[loss, accuracy, f1, precision, recall] = sherlockNet::eval(probs, Y)
diff --git a/scripts/builtin/shortestPath.dml b/scripts/builtin/shortestPath.dml
index 7f04799..4aaf630 100644
--- a/scripts/builtin/shortestPath.dml
+++ b/scripts/builtin/shortestPath.dml
@@ -18,35 +18,39 @@
# under the License.
#
#-------------------------------------------------------------
-#
-# Computes the minimum distances (shortest-path) between a single
-# source vertex and every other vertex in the graph.
+
+# Computes the minimum distances (shortest-path) between a single source vertex and every other vertex in the graph.
#
# Grzegorz Malewicz, Matthew H. Austern, Aart J. C. Bilk,
# James C. Dehnert, Ikkan Horn, Naty Leiser and Grzegorz Czajkowski:
# Pregel: A System for Large-Scale Graph Processing
#
-#------------------------------------------------------------------------------
-# NAME TYPE MEANING
-# G MATRIX adjacency matrix of the labeled graph: Such graph can be directed
-# (G is symmetric) or undirected (G is not symmetric).
-# The values of G can be 0/1 (just specifying whether the nodes
-# are connected or not) or integer values (representing the weight
-# of the edges or the distances between nodes, 0 if not connected).
-#
-# maxi Integer Integer max number of iterations accepted (0 for FALSE, i.e.
-# max number of iterations not defined)
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# G Matrix[Double] --- adjacency matrix of the labeled graph: Such graph can be directed
+# (G is not symmetric) or undirected (G is symmetric).
+# The values of G can be 0/1 (just specifying whether the nodes
+# are connected or not) or integer values (representing the weight
+# of the edges or the distances between nodes, 0 if not connected).
+# maxi Integer 0 Integer max number of iterations accepted (0 for FALSE, i.e.
+# max number of iterations not defined)
+# sourceNode Integer --- node index to calculate the shortest paths to all other nodes.
+# verbose Boolean FALSE flag for verbose debug output
#
-# sourceNode Integer node index to calculate the shortest paths to all other nodes.
+# ----------------------------------------------------------------------------------------------------------------------
#
-# verbose Boolean flag for verbose debug output
-#------------------------------------------------------------------------------
-# C Matrix Output matrix (double) of minimum distances (shortest-path) between
-# vertices: The value of the ith row and the jth column of the output
-# matrix is the minimum distance shortest-path from vertex i to vertex j.
-# When the value of the minimum distance is infinity, the two nodes are
-# not connected.
-#------------------------------------------------------------------------------
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# C Matrix[Double] Output matrix (double) of minimum distances (shortest-path) between
+# vertices: The value of the ith row and the jth column of the output
+# matrix is the minimum distance shortest-path from vertex i to vertex j.
+# When the value of the minimum distance is infinity, the two nodes are
+# not connected.
+# ----------------------------------------------------------------------------------------------------------------------
m_shortestPath = function(Matrix[Double] G, Integer maxi = 0, Integer sourceNode, Boolean verbose = FALSE)
return (Matrix[Double] C)
diff --git a/scripts/builtin/sigmoid.dml b/scripts/builtin/sigmoid.dml
index e37ce0b..7f5c441 100644
--- a/scripts/builtin/sigmoid.dml
+++ b/scripts/builtin/sigmoid.dml
@@ -19,10 +19,27 @@
#
#-------------------------------------------------------------
+# The Sigmoid function is a type of activation function, also described as a squashing function, which limits the
+# output to a range between 0 and 1, making it useful in the prediction of probabilities.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors.
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Matrix with the sigmoid 1 / (1 + exp(-X)) applied element-wise to X.
+# ----------------------------------------------------------------------------------------------------------------------
+
m_sigmoid = function(Matrix[Double] X) return (Matrix[Double] Y) {
Y = 1 / (1 + exp(-X));
}
s_sigmoid = function(Double x) return (Double y) {
y = 1 / (1 + exp(-x));
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/slicefinder.dml b/scripts/builtin/slicefinder.dml
index 0eef59e..b745484 100644
--- a/scripts/builtin/slicefinder.dml
+++ b/scripts/builtin/slicefinder.dml
@@ -23,29 +23,34 @@
# ML model debugging technique for finding the top-k data slices where
# a trained models performs significantly worse than on the overall
# dataset. For a detailed description and experimental results, see:
+# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging. (SIGMOD 2021)
#
-# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast,
-# Linear-Algebra-based Slice Finding for ML Model Debugging.
-# In: SIGMOD 2021.
-
-#-------------------------------------------------------------
-# X Input matrix (integer encoded [1..v])
-# e error vector (classification accuracy, l2 norm, etc)
-# k top-K subsets / slices
-# maxL maximum level L (conjunctions of L predicates), 0 unlimited
-# minSup minimum support (min number of rows per slice)
-# alpha weight [0,1]: 0 only size, 1 only error
-# tpEval flag for task-parallel slice evaluation,
-# otherwise data-parallel
-# tpBlksz block size for task-parallel execution (num slices)
-# selFeat flag for removing one-hot-encoded features that don't satisfy
-# the initial minimum-support constraint and/or have zero error
-# verbose flag for verbose debug output
-# ------------------------------------------------------------
-# TK top-k slices (k x ncol(X) if successful)
-# TKC score, size, error of slices (k x 3)
-# D debug matrix, populated with enumeration stats if verbose
-# ------------------------------------------------------------
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input matrix (recoded dataset, integer encoded [1..v])
+# e Matrix[Double] --- Error vector (classification accuracy, l2 norm, etc)
+# k Integer 4 Number of top-k subsets / slices to return
+# maxL Integer 0 maximum level L (conjunctions of L predicates), 0 unlimited
+# minSup Integer 32 minimum support (min number of rows per slice)
+# alpha Double 0.5 weight [0,1]: 0 only size, 1 only error
+# tpEval Boolean TRUE flag for task-parallel slice evaluation,
+# otherwise data-parallel
+# tpBlksz Integer block size for task-parallel execution (num slices)
+# selFeat Boolean flag for removing one-hot-encoded features that don't satisfy
+# the initial minimum-support constraint and/or have zero error
+# verbose Boolean flag for verbose debug output
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# TK Matrix[Double] top-k slices (k x ncol(X) if successful)
+# TKC Matrix[Double] score, size, error of slices (k x 3)
+# D Matrix[Double] debug matrix, populated with enumeration stats if verbose
+# ----------------------------------------------------------------------------------------------------------------------
m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, Int k = 4,
Int maxL = 0, Int minSup = 32, Double alpha = 0.5, Boolean tpEval = TRUE,
diff --git a/scripts/builtin/smote.dml b/scripts/builtin/smote.dml
index 9c24894..29d03ff 100644
--- a/scripts/builtin/smote.dml
+++ b/scripts/builtin/smote.dml
@@ -19,28 +19,28 @@
#
#-------------------------------------------------------------
-
# Builtin function for handing class imbalance using Synthetic Minority Over-sampling Technique (SMOTE)
# by Nitesh V. Chawla et. al. In Journal of Artificial Intelligence Research 16 (2002). 321–357
#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix of minority class samples
-# mask Double --- 0/1 mask vector where 0 represent numeric value and 1 represent categorical value
-# s Integer 25 Amount of SMOTE (percentage of oversampling), integral multiple of 100
-# k Integer 1 Number of nearest neighbour
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# Y Double --- Matrix of (N/100)-1 * nrow(X) synthetic minority class samples
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of minority class samples
+# mask Matrix[Double] --- 0/1 mask vector where 0 represents a numeric value and 1 represents a categorical value
+# s Integer 200 Amount of SMOTE (percentage of oversampling), integral multiple of 100
+# k Integer 1 Number of nearest neighbours
+# verbose Boolean FALSE if the algorithm should be verbose
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Matrix of (N/100)-1 * nrow(X) synthetic minority class samples
+# ----------------------------------------------------------------------------------------------------------------------
-m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
+m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, Integer k = 1, Boolean verbose = FALSE)
return (Matrix[Double] Y) {
if(s < 100 | (s%%100) != 0)
diff --git a/scripts/builtin/softmax.dml b/scripts/builtin/softmax.dml
index f9e110b..401532f 100644
--- a/scripts/builtin/softmax.dml
+++ b/scripts/builtin/softmax.dml
@@ -17,6 +17,23 @@
# specific language governing permissions and limitations
# under the License.
#
+#-------------------------------------------------------------
+
+# This function computes the forward pass for a softmax classifier.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# S Matrix[Double] --- Inputs of shape (N, D).
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# P Matrix[Double] Outputs of shape (N, D).
+# ----------------------------------------------------------------------------------------------------------------------
source("nn/layers/softmax.dml") as sm
diff --git a/scripts/builtin/split.dml b/scripts/builtin/split.dml
index c5c1066..78a51cb 100644
--- a/scripts/builtin/split.dml
+++ b/scripts/builtin/split.dml
@@ -19,21 +19,28 @@
#
#-------------------------------------------------------------
-# Split input data X and Y into contiguous or samples train/test sets
-# ------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# Y Matrix --- Input Labels
-# f Double 0.7 Train set fraction [0,1]
-# cont Boolean TRUE contiuous splits, otherwise sampled
-# seed Integer -1 The seed to reandomly select rows in sampled mode
-# ------------------------------------------------------------------------------
-# Xtrain Matrix --- Train split of feature matrix
-# Xtest Matrix --- Test split of feature matrix
-# ytrain Matrix --- Train split of label matrix
-# ytest Matrix --- Test split of label matrix
-# ------------------------------------------------------------------------------
+# This function splits input data X and Y into contiguous or sampled train/test sets
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# Y Matrix[Double] --- Input Labels
+# f Double 0.7 Train set fraction [0,1]
+# cont Boolean TRUE contiguous splits, otherwise sampled
+# seed Integer -1 The seed to randomly select rows in sampled mode
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Xtrain Matrix[Double] Train split of feature matrix
+# Xtest Matrix[Double] Test split of feature matrix
+# Ytrain Matrix[Double] Train split of label matrix
+# Ytest Matrix[Double] Test split of label matrix
+# ----------------------------------------------------------------------------------------------------------------------
m_split = function(Matrix[Double] X, Matrix[Double] Y, Double f=0.7, Boolean cont=TRUE, Integer seed=-1)
return (Matrix[Double] Xtrain, Matrix[Double] Xtest, Matrix[Double] Ytrain, Matrix[Double] Ytest)
diff --git a/scripts/builtin/splitBalanced.dml b/scripts/builtin/splitBalanced.dml
index 32b87d7..da314d9 100644
--- a/scripts/builtin/splitBalanced.dml
+++ b/scripts/builtin/splitBalanced.dml
@@ -19,24 +19,30 @@
#
#-------------------------------------------------------------
+# This function splits input data X and Y into contiguous train/test sets using a balanced ratio
# Related to [SYSTEMDS-2902] dependency function for cleaning pipelines
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# Y Matrix[Double] --- Input Labels
+# splitRatio Double 0.7 Train set fraction [0,1]
+# verbose Boolean FALSE If TRUE, print verbose output
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_train Matrix[Double] Train split of feature matrix
+# y_train Matrix[Double] Train split of label matrix
+# X_test Matrix[Double] Test split of feature matrix
+# y_test Matrix[Double] Test split of label matrix
+# ----------------------------------------------------------------------------------------------------------------------
-# Split input data X and Y into contiguous balanced ratio
-# ------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ------------------------------------------------------------------------------
-# X Matrix --- Input feature matrix
-# Y Matrix --- Input Labels
-# f Double 0.7 Train set fraction [0,1]
-# verbose Boolean FALSE print available
-# ------------------------------------------------------------------------------
-# X_train Matrix --- Train split of feature matrix
-# X_test Matrix --- Test split of feature matrix
-# y_train Matrix --- Train split of label matrix
-# y_test Matrix --- Test split of label matrix
-# ------------------------------------------------------------------------------
-
-m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio, Boolean verbose)
+m_splitBalanced = function(Matrix[Double] X, Matrix[Double] Y, Double splitRatio = 0.7, Boolean verbose = FALSE)
return (Matrix[Double] X_train, Matrix[Double] y_train, Matrix[Double] X_test,
Matrix[Double] y_test)
{
diff --git a/scripts/builtin/stableMarriage.dml b/scripts/builtin/stableMarriage.dml
index e6293cd..3eb493c 100644
--- a/scripts/builtin/stableMarriage.dml
+++ b/scripts/builtin/stableMarriage.dml
@@ -1,5 +1,6 @@
#-------------------------------------------------------------
-## Licensed to the Apache Software Foundation (ASF) under one
+#
+# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
@@ -17,58 +18,66 @@
# under the License.
#
#-------------------------------------------------------------
-# THIS SCRIPT COMPUTES A SOLUTION FOR THE STABLE MARRIAGE PROBLEM
+
+# This script computes a solution for the stable marriage problem.
#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# P Matrix --- proposer matrix P.
-# It must be a square matrix with no zeros.
-#
-# A Matrix --- acceptor matrix A.
-# It must be a square matrix with no zeros.
-#
-# ordered Boolean TRUE If true, P and A are assumed to be ordered,
-# i.e. the leftmost value in a row is the most preferred partner's index.
-# i.e. the leftmost value in a row in P is the preference value for the acceptor with
-# index 1 and vice-versa (higher is better).
-# OUTPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-#result_matrix Matrix --- Result Matrix
-# If cell [i,j] is non-zero, it means that acceptor i has matched with proposer j.
-# Further, if cell [i,j] is non-zero, it holds the preference value that led to the match.
-#
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# P Matrix[Double] --- proposer matrix P.
+# It must be a square matrix with no zeros.
+# A Matrix[Double] --- acceptor matrix A.
+# It must be a square matrix with no zeros.
+# ordered Boolean TRUE If true, P and A are assumed to be ordered,
+# i.e. the leftmost value in a row is the most preferred partner's index.
+# If false, the values are preference values,
+# i.e. the leftmost value in a row in P is the preference value for the acceptor with
+# index 1 and vice-versa (higher is better).
+# verbose Boolean FALSE If true, print verbose output such as the result matrix.
+# ----------------------------------------------------------------------------------------------------------------------
#
-# Proposers.mtx:
-# 2.0,1.0,3.0
-# 1.0,2.0,3.0
-# 1.0,3.0,2.0
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# result_matrix Matrix[Double] Result Matrix
+# If cell [i,j] is non-zero, it means that acceptor i has matched with
+# proposer j. Further, if cell [i,j] is non-zero, it holds the preference
+# value that led to the match.
+# Proposers.mtx:
+# 2.0,1.0,3.0
+# 1.0,2.0,3.0
+# 1.0,3.0,2.0
#
-# Since ordered=TRUE, this means that proposer 1 (row 1) likes acceptor 2 the most, followed by acceptor 1 and acceptor 3.
-# If ordered=FALSE, this would mean that proposer 1 (row 1) likes acceptor 3 the most (since the value at [1,3] is the row max),
-# followed by acceptor 1 (2.0 preference value) and acceptor 2 (1.0 preference value).
+# Since ordered=TRUE, this means that proposer 1 (row 1) likes acceptor 2
+# the most, followed by acceptor 1 and acceptor 3.
+# If ordered=FALSE, this would mean that proposer 1 (row 1) likes acceptor 3
+# the most (since the value at [1,3] is the row max),
+# followed by acceptor 1 (2.0 preference value) and acceptor 2 (1.0 preference value).
#
-# Acceptors.mtx:
-# 3.0,1.0,2.0
-# 2.0,1.0,3.0
-# 3.0,2.0,1.0
+# Acceptors.mtx:
+# 3.0,1.0,2.0
+# 2.0,1.0,3.0
+# 3.0,2.0,1.0
#
-# Since ordered=TRUE, this means that acceptor 1 (row 1) likes proposer 3 the most, followed by proposer 1 and proposer 2.
-# If ordered=FALSE, this would mean that acceptor 1 (row 1) likes proposer 1 the most (since the value at [1,1] is the row max),
-# followed by proposer 3 (2.0 preference value) and proposer 2 (1.0 preference value).
+# Since ordered=TRUE, this means that acceptor 1 (row 1) likes proposer 3
+# the most, followed by proposer 1 and proposer 2.
+# If ordered=FALSE, this would mean that acceptor 1 (row 1) likes proposer 1
+# the most (since the value at [1,1] is the row max),
+# followed by proposer 3 (2.0 preference value) and proposer 2
+# (1.0 preference value).
#
-# Output.mtx (assuming ordered=TRUE):
-# 0.0,0.0,3.0
-# 0.0,3.0,0.0
-# 1.0,0.0,0.0
+# Output.mtx (assuming ordered=TRUE):
+# 0.0,0.0,3.0
+# 0.0,3.0,0.0
+# 1.0,0.0,0.0
#
-# Acceptor 1 has matched with proposer 3 (since [1,3] is non-zero) at a preference level of 3.0.
-# Acceptor 2 has matched with proposer 2 (since [2,2] is non-zero) at a preference level of 3.0.
-# Acceptor 3 has matched with proposer 1 (since [3,1] is non-zero) at a preference level of 1.0.
-# --------------------------------------------------------------------------------------------
+# Acceptor 1 has matched with proposer 3 (since [1,3] is non-zero) at a
+# preference level of 3.0.
+# Acceptor 2 has matched with proposer 2 (since [2,2] is non-zero) at a
+# preference level of 3.0.
+# Acceptor 3 has matched with proposer 1 (since [3,1] is non-zero) at a
+# preference level of 1.0.
+# ----------------------------------------------------------------------------------------------------------------------
m_stableMarriage = function(Matrix[Double] P, Matrix[Double] A, Boolean ordered = TRUE, Boolean verbose = FALSE)
return (Matrix[Double] result_matrix)
@@ -151,4 +160,4 @@ m_stableMarriage = function(Matrix[Double] P, Matrix[Double] A, Boolean ordered
if(verbose)
print("Result: \n"+toString(result_matrix))
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/statsNA.dml b/scripts/builtin/statsNA.dml
index 5f58c56..5543221 100644
--- a/scripts/builtin/statsNA.dml
+++ b/scripts/builtin/statsNA.dml
@@ -19,28 +19,35 @@
#
#-------------------------------------------------------------
-# Print summary stats about the distribution of missing values in a univariate time series.
-# ------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ------------------------------------------------------------------------------
-# X Matrix --- Numeric Vector ('vector') object containing NAs
-# bins Integer 4 Split number for bin stats. Number of bins the time series gets
-# divided into. For each bin information about amount/percentage of
-# missing values is printed.
-# verbose Boolean TRUE Print detailed information.
-# For print_only = TRUE, the missing value stats are printed with
-# more information ("Stats for Bins" and "overview NA series").
-# ------------------------------------------------------------------------------
-# stats Matrix Double Column vector where each row correspond to following,
-# 1. Length of time series (including NAs)
-# 2. Number of Missing Values (NAs)
-# 3. Percentage of Missing Values (#2/#1)
-# 4. Number of Gaps (consisting of one or more consecutive NAs)
-# 5. Average Gap Size - Average size of consecutive NAs for the NA gaps
-# 6. Longest NA gap - Longest series of consecutive missing values
-# 7. Most frequent gap size - Most frequently occurring gap size
-# 8. Gap size accounting for most NAs
-# ------------------------------------------------------------------------------
+# The statsNA-function prints summary stats about the distribution of missing values in a univariate time series.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Numeric Vector ('vector') object containing NAs
+# bins Integer 4 Split number for bin stats. Number of bins the time series gets
+# divided into. For each bin information about amount/percentage of
+# missing values is printed.
+# verbose Boolean TRUE Print detailed information.
+# For verbose = TRUE, the missing value stats are printed with
+# more information ("Stats for Bins" and "overview NA series").
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# stats Matrix[Double] Column vector where each row corresponds to the following:
+# 1. Length of time series (including NAs)
+# 2. Number of Missing Values (NAs)
+# 3. Percentage of Missing Values (#2/#1)
+# 4. Number of Gaps (consisting of one or more consecutive NAs)
+# 5. Average Gap Size - Average size of consecutive NAs for the NA gaps
+# 6. Longest NA gap - Longest series of consecutive missing values
+# 7. Most frequent gap size - Most frequently occurring gap size
+# 8. Gap size accounting for most NAs
+# ----------------------------------------------------------------------------------------------------------------------
m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE)
return(Matrix[Double] stats)
diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index 800c2ca..f325770 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -19,48 +19,44 @@
#
#-------------------------------------------------------------
-
-#
-# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC
-# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X String --- Location (on HDFS) to read the matrix X of feature vectors
-# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values
-# B String --- Location to store estimated regression parameters (the betas)
-# S String --- Location to write the selected features ordered as computed by the algorithm
-# O String " " Location to write the printed statistics; by default is standard output
-# icpt Int 0 Intercept presence, shifting and rescaling the columns of X:
-# 0 = no intercept, no shifting, no rescaling;
-# 1 = add intercept, but neither shift nor rescale X;
-# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
-# no further features are being checked and the algorithm stops
-# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv"
-# write_beta Boolean TRUE Should the beta's be returned?
-# 0 = no
-# 1 = yes
-# --------------------------------------------------------------------------------------------
+# The steplm-function (stepwise linear regression) implements a classical forward feature selection method.
+# This method iteratively runs what-if scenarios and greedily selects the next best feature
+# until the Akaike information criterion (AIC) does not improve anymore. Each configuration trains a regression model
+# via lm, which in turn calls either the closed form lmDS or iterative lmCG.
# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value:
# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B:
# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
# Col.2: betas for shifted/rescaled X and intercept
-#
# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated
# name-value pair per each line, as follows:
#
-# NAME MEANING
-# -------------------------------------------------------------------------------------
-# AVG_TOT_Y Average of the response value Y
-# STDEV_TOT_Y Standard Deviation of the response value Y
-# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias
-
-m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X of feature vectors
+# y Matrix[Double] --- 1-column matrix y of response values
+# icpt Integer 0 Intercept presence, shifting and rescaling the columns of X:
+# 0 = no intercept, no shifting, no rescaling;
+# 1 = add intercept, but neither shift nor rescale X;
+# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg Double 1e-7 Regularization constant (lambda) for L2-regularization
+# tol Double 1e-7 Tolerance threshold to train until achieved
+# maxi Integer 0 Maximum number of iterations; 0 means until tolerance is reached
+# verbose Boolean TRUE If the algorithm should be verbose
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# B Matrix[Double] Matrix of regression parameters (the betas) and its size depend on icpt input value.
+# S Matrix[Double] Matrix of selected features ordered as computed by the algorithm.
+# ----------------------------------------------------------------------------------------------------------------------
+
+m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
return(Matrix[Double] B, Matrix[Double] S)
{
diff --git a/scripts/builtin/stratstats.dml b/scripts/builtin/stratstats.dml
index 13500cb..9fe45e6 100644
--- a/scripts/builtin/stratstats.dml
+++ b/scripts/builtin/stratstats.dml
@@ -19,68 +19,70 @@
#
#-------------------------------------------------------------
-
+# The stratstats.dml script computes common bivariate statistics, such as correlation, slope, and their p-value,
+# in parallel for many pairs of input variables in the presence of a confounding categorical variable.
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X Double --- Matrix X that has all 1-st covariates
-# Y Double " " Matrix Y that has all 2-nd covariates
-# the default value " " means "use X in place of Y"
-# S Double " " Matrix S that has the stratum column
-# the default value " " means "use X in place of S"
-# Xcid Double " " 1-st covariate X-column indices
-# the default value " " means "use columns 1 : ncol(X)"
-# Ycid Double " " 2-nd covariate Y-column indices
-# the default value " " means "use columns 1 : ncol(Y)"
-# Scid Int 1 Column index of the stratum column in S
-# ---------------------------------------------------------------------------------------------
-
-
-# OUTPUT MATRIX:
-# ---------------------------------------------------------------------------------------------
-# OutMtx Double --- Output matrix, one row per each distinct pair
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix X that has all 1-st covariates
+# Y Matrix[Double] empty Matrix Y that has all 2-nd covariates
+# the default value empty means "use X in place of Y"
+# S Matrix[Double] empty Matrix S that has the stratum column
+# the default value empty means "use X in place of S"
+# Xcid Matrix[Double] empty 1-st covariate X-column indices
+# the default value empty means "use columns 1 : ncol(X)"
+# Ycid Matrix[Double] empty 2-nd covariate Y-column indices
+# the default value empty means "use columns 1 : ncol(Y)"
+# Scid Int 1 Column index of the stratum column in S
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# OutMtx Matrix[Double] Output matrix, one row per each distinct pair
# (1st covariante, 2nd covariante)
# 40 columns containing the following information:
-# Col 01: 1st covariate X-column number
-# Col 02: 1st covariate global presence count
-# Col 03: 1st covariate global mean
-# Col 04: 1st covariate global standard deviation
-# Col 05: 1st covariate stratified standard deviation
-# Col 06: R-squared, 1st covariate vs. strata
-# Col 07: adjusted R-squared, 1st covariate vs. strata
-# Col 08: P-value, 1st covariate vs. strata
-# Col 09-10: Reserved
-# Col 11: 2nd covariate Y-column number
-# Col 12: 2nd covariate global presence count
-# Col 13: 2nd covariate global mean
-# Col 14: 2nd covariate global standard deviation
-# Col 15: 2nd covariate stratified standard deviation
-# Col 16: R-squared, 2nd covariate vs. strata
-# Col 17: adjusted R-squared, 2nd covariate vs. strata
-# Col 18: P-value, 2nd covariate vs. strata
-# Col 19-20: Reserved
-# Col 21: Global 1st & 2nd covariate presence count
-# Col 22: Global regression slope (2nd vs. 1st covariate)
-# Col 23: Global regression slope standard deviation
-# Col 24: Global correlation = +/- sqrt(R-squared)
-# Col 25: Global residual standard deviation
-# Col 26: Global R-squared
-# Col 27: Global adjusted R-squared
-# Col 28: Global P-value for hypothesis "slope = 0"
-# Col 29-30: Reserved
-# Col 31: Stratified 1st & 2nd covariate presence count
-# Col 32: Stratified regression slope (2nd vs. 1st covariate)
-# Col 33: Stratified regression slope standard deviation
-# Col 34: Stratified correlation = +/- sqrt(R-squared)
-# Col 35: Stratified residual standard deviation
-# Col 36: Stratified R-squared
-# Col 37: Stratified adjusted R-squared
-# Col 38: Stratified P-value for hypothesis "slope = 0"
-# Col 39: Number of strata with at least two counted points
-# Col 40: Reserved
-# ---------------------------------------------------------------------------------------------
-
+# Col 01: 1st covariate X-column number
+# Col 02: 1st covariate global presence count
+# Col 03: 1st covariate global mean
+# Col 04: 1st covariate global standard deviation
+# Col 05: 1st covariate stratified standard deviation
+# Col 06: R-squared, 1st covariate vs. strata
+# Col 07: adjusted R-squared, 1st covariate vs. strata
+# Col 08: P-value, 1st covariate vs. strata
+# Col 09-10: Reserved
+# Col 11: 2nd covariate Y-column number
+# Col 12: 2nd covariate global presence count
+# Col 13: 2nd covariate global mean
+# Col 14: 2nd covariate global standard deviation
+# Col 15: 2nd covariate stratified standard deviation
+# Col 16: R-squared, 2nd covariate vs. strata
+# Col 17: adjusted R-squared, 2nd covariate vs. strata
+# Col 18: P-value, 2nd covariate vs. strata
+# Col 19-20: Reserved
+# Col 21: Global 1st & 2nd covariate presence count
+# Col 22: Global regression slope (2nd vs. 1st covariate)
+# Col 23: Global regression slope standard deviation
+# Col 24: Global correlation = +/- sqrt(R-squared)
+# Col 25: Global residual standard deviation
+# Col 26: Global R-squared
+# Col 27: Global adjusted R-squared
+# Col 28: Global P-value for hypothesis "slope = 0"
+# Col 29-30: Reserved
+# Col 31: Stratified 1st & 2nd covariate presence count
+# Col 32: Stratified regression slope (2nd vs. 1st covariate)
+# Col 33: Stratified regression slope standard deviation
+# Col 34: Stratified correlation = +/- sqrt(R-squared)
+# Col 35: Stratified residual standard deviation
+# Col 36: Stratified R-squared
+# Col 37: Stratified adjusted R-squared
+# Col 38: Stratified P-value for hypothesis "slope = 0"
+# Col 39: Number of strata with at least two counted points
+# Col 40: Reserved
+# ----------------------------------------------------------------------------------------------------------------------
m_stratstats = function(Matrix[Double] X, Matrix[Double] Y = matrix(0.0, rows=1,cols=1),
Matrix[Double] S = matrix(0.0, rows=1,cols=1), Matrix[Double] Xcid = matrix(0.0, rows=1,cols=1),
diff --git a/scripts/builtin/symmetricDifference.dml b/scripts/builtin/symmetricDifference.dml
index 77f1a92..dc18386 100644
--- a/scripts/builtin/symmetricDifference.dml
+++ b/scripts/builtin/symmetricDifference.dml
@@ -20,22 +20,21 @@
#-------------------------------------------------------------
# Builtin function that implements symmetric difference set-operation on vectors
-
+#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# X Matrix --- input vector
+# X Matrix[Double] --- input vector
+# Y Matrix[Double] --- input vector
# ---------------------------------------------------------------------------------------------
-# Y Matrix --- input vector
+#
+# OUTPUT:
# ---------------------------------------------------------------------------------------------
-
-# Output(s)
+# NAME TYPE MEANING
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# R Matrix[Double] vector with all elements in X and Y but not in both
# ---------------------------------------------------------------------------------------------
-# R Matrix --- vector with all elements in X and Y but not in both
-
symmetricDifference = function(Matrix[Double] X, Matrix[Double] Y)
return (matrix[double] R)
diff --git a/scripts/builtin/tSNE.dml b/scripts/builtin/tSNE.dml
index 3e7926a..e9ab7d5 100644
--- a/scripts/builtin/tSNE.dml
+++ b/scripts/builtin/tSNE.dml
@@ -21,30 +21,29 @@
# This function performs dimensionality reduction using tSNE algorithm based on
# the paper: Visualizing Data using t-SNE, Maaten et. al.
-
-# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Double --- Data Matrix of shape
-# (number of data points, input dimensionality)
-# reduced_dims Integer 2 Output dimensionality
-# perplexity Integer 30 Perplexity Parameter
-# lr Double 300. Learning rate
-# momentum Double 0.9 Momentum Parameter
-# max_iter Integer 1000 Number of iterations
-# seed Integer -1 The seed used for initial values.
-# If set to -1 random seeds are selected.
-# is_verbose Boolean FALSE Print debug information
#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix of shape
+# (number of data points, input dimensionality)
+# reduced_dims Integer 2 Output dimensionality
+# perplexity Integer 30 Perplexity Parameter
+# lr Double 300. Learning rate
+# momentum Double 0.9 Momentum Parameter
+# max_iter Integer 1000 Number of iterations
+# seed Integer -1 The seed used for initial values.
+# If set to -1 random seeds are selected.
+# is_verbose Boolean FALSE Print debug information
+# ----------------------------------------------------------------------------------------------------------------------
#
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# Y Matrix --- Data Matrix of shape (number of data points, reduced_dims)
-# ----------------------------------------------------------------------------
-
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Data Matrix of shape (number of data points, reduced_dims)
+# ----------------------------------------------------------------------------------------------------------------------
m_tSNE = function(Matrix[Double] X, Integer reduced_dims = 2, Integer perplexity = 30,
Double lr = 300., Double momentum = 0.9, Integer max_iter = 1000, Integer seed = -1, Boolean is_verbose = FALSE)
diff --git a/scripts/builtin/toOneHot.dml b/scripts/builtin/toOneHot.dml
index 8134f5c..fd76e89 100644
--- a/scripts/builtin/toOneHot.dml
+++ b/scripts/builtin/toOneHot.dml
@@ -19,24 +19,25 @@
#
#-------------------------------------------------------------
-# One-hot encodes a vector
-
+# The toOneHot-function encodes unordered categorical vector to multiple binarized vectors.
+#
# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# --------------------------------------------------------------------------------------------
-# X matrix --- vector with N integer entries between 1 and numClasses
-# numclasses int --- number of columns, must be >= largest value in X
-
-# Output:
-# --------------------------------------------------------------------------------------------
-# NAME TYPE MEANING
-# -------------------------------------------------------------------------------------------
-# Y matrix one-hot-encoded matrix with shape (N, numClasses)
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Vector with N integer entries between 1 and numClasses
+# numClasses int --- Number of columns, must be greater than or equal to largest value in X
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] One-hot-encoded matrix with shape (N, numClasses)
+# ----------------------------------------------------------------------------------------------------------------------
-m_toOneHot = function(matrix[double] X, integer numClasses)
- return (matrix[double] Y) {
+m_toOneHot = function(Matrix[Double] X, integer numClasses)
+ return (Matrix[Double] Y) {
if(numClasses < max(X))
stop("numClasses must be >= largest value in X to prevent cropping");
Y = table(seq(1, nrow(X)), X, nrow(X), numClasses);
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 6169dbf..6f1db03 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -18,23 +18,27 @@
# under the License.
#
#-------------------------------------------------------------
-#
-# UNDERSAMPLING TECHNIQUE;
-# COMPUTES TOMEK LINKS AND DROPS THEM FROM DATA MATRIX AND LABEL VECTOR
-# DROPS ONLY THE MAJORITY LABEL AND CORRESPONDING POINT OF TOMEK LINKS
+
+# The tomekLink-function performs undersampling by removing Tomek's links for imbalanced multiclass problems
+# Computes TOMEK links and drops them from data matrix and label vector.
+# Drops only the majority label and corresponding point of TOMEK links.
#
# INPUT PARAMETERS:
-# ------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ------------------------------------------------------------
-# X MATRIX --- Data Matrix (nxm)
-# y MATRIX --- Label Matrix (nx1), greater than zero
-# ------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Data Matrix (nxm)
+# y Matrix[Double] --- Label Matrix (nx1), greater than zero
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# X_under - Data Matrix without Tomek links
-# y_under - Labels corresponding to undersampled data
-# drop_idx - Indices of dropped rows/labels wrt input
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X_under Matrix[Double] Data Matrix without Tomek links
+# y_under Matrix[Double] Labels corresponding to undersampled data
+# drop_idx Matrix[Double] Indices of dropped rows/labels wrt input
+# ----------------------------------------------------------------------------------------------------------------------
m_tomeklink = function(Matrix[Double] X, Matrix[Double] y)
return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx)
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index f2a4f68..e9aebaf 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -18,6 +18,39 @@
# under the License.
#
#-------------------------------------------------------------
+
+# This function cleans top-K item (where K is given as input) for a given list of users.
+# metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# dataTrain Frame[Unknown] ---
+# dataTest Frame[Unknown] NULL
+# metaData Frame[Unknown] NULL
+# primitives Frame[Unknown] ---
+# parameters Frame[Unknown] ---
+# cmr Matrix[Double] Matrix
+# evaluationFunc String ---
+# evalFunHp Matrix[Double] ---
+# topK Integer 5
+# resource_val Integer 20
+# sample Double 0.1
+# cv Boolean TRUE
+# cvk Integer 2
+# isLastLabel Boolean TRUE
+# correctTypos Boolean FALSE
+# output String ---
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# perf Boolean
+# ----------------------------------------------------------------------------------------------------------------------
+
# metaData[3, ncol(X)] : metaData[1] stores mask, metaData[2] stores schema, metaData[3] stores FD mask
source("scripts/pipelines/scripts/utils.dml") as utils;
source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
diff --git a/scripts/builtin/underSampling.dml b/scripts/builtin/underSampling.dml
index 88404f8..9dfae3c 100644
--- a/scripts/builtin/underSampling.dml
+++ b/scripts/builtin/underSampling.dml
@@ -18,7 +18,24 @@
# under the License.
#
#-------------------------------------------------------------
-# # # following built-in performs random under sampling on data
+
+# Builtin to perform random under sampling on data.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- X data to sample from
+# Y Matrix[Double] --- Y data to sample from; the same rows are sampled as from X.
+# ratio Double --- The ratio to sample
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# data Matrix[Double] The cbinded data of X and Y
+# ----------------------------------------------------------------------------------------------------------------------
m_underSampling = function(Matrix[Double] X, Matrix[Double] Y, Double ratio)
return(Matrix[Double] data)
diff --git a/scripts/builtin/union.dml b/scripts/builtin/union.dml
index b75e093..73ce3c4 100644
--- a/scripts/builtin/union.dml
+++ b/scripts/builtin/union.dml
@@ -20,22 +20,21 @@
#-------------------------------------------------------------
# Builtin function that implements union operation on vectors
-
+#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X Matrix --- input vector
-# ---------------------------------------------------------------------------------------------
# Y Matrix --- input vector
# ---------------------------------------------------------------------------------------------
-
-# Output(s)
+#
+# OUTPUT:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE MEANING
+# ---------------------------------------------------------------------------------------------
+# R Matrix matrix with all unique rows existing in X and Y
# ---------------------------------------------------------------------------------------------
-# R Matrix --- matrix with all unique rows existing in X and Y
-
union = function(Matrix[Double] X, Matrix[Double] Y)
return (matrix[double] R)
diff --git a/scripts/builtin/unique.dml b/scripts/builtin/unique.dml
index f4db3bf..ac40375 100644
--- a/scripts/builtin/unique.dml
+++ b/scripts/builtin/unique.dml
@@ -20,19 +20,20 @@
#-------------------------------------------------------------
# Builtin function that implements unique operation on vectors
-
+#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
-# X Matrix --- input vector
+# X Matrix[Double] --- input vector
# ---------------------------------------------------------------------------------------------
-
-# Output(s)
+#
+# OUTPUT:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
+# R Matrix[Double] matrix with only unique rows
# ---------------------------------------------------------------------------------------------
-# R Matrix --- matrix with only unique rows
unique = function(matrix[double] X)
return (matrix[double] R)
diff --git a/scripts/builtin/univar.dml b/scripts/builtin/univar.dml
index 80a4606..8d2ac00 100644
--- a/scripts/builtin/univar.dml
+++ b/scripts/builtin/univar.dml
@@ -18,18 +18,24 @@
# under the License.
#
#-------------------------------------------------------------
-#
+
# Computes univariate statistics for all attributes in a given data set
#
# INPUT PARAMETERS:
-# -------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
-# -------------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- Input matrix of the shape (N, D)
# TYPES Matrix[Integer] --- Matrix of the shape (1, D) with features types:
# 1 for scale, 2 for nominal, 3 for ordinal
-# -------------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of summary statistics
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# univarStats Matrix[Double] univariate statistics for all attributes
+# ----------------------------------------------------------------------------------------------------------------------
m_univar = function(Matrix[Double] X, Matrix[Double] types)
return(Matrix[Double] univarStats)
diff --git a/scripts/builtin/vectorToCsv.dml b/scripts/builtin/vectorToCsv.dml
index cc54094..8f64053 100644
--- a/scripts/builtin/vectorToCsv.dml
+++ b/scripts/builtin/vectorToCsv.dml
@@ -19,25 +19,22 @@
#
#-------------------------------------------------------------
+# This builtin function converts a vector into a csv string such as [1 0 0 1 1 0 1] = "1,4,5,7"
# Related to [SYSTEMDS-2662] dependency function for cleaning pipelines
-
-# function to convert vector into csv string sunch as [1 0 0 1 1 0 1] = "1,4,5,7"
+#
# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# mask Double --- Data vector (having 0 for excluded indexes)
-# ---------------------------------------------------------------------------------------------
-
-
-#Output(s)
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# indexes Double --- string indexes
-
-
-
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# mask Matrix[Double] --- Data vector (having 0 for excluded indexes)
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# indexes String indexes
+# ----------------------------------------------------------------------------------------------------------------------
m_vectorToCsv = function(Matrix[Double] mask)
return (String indexes){
@@ -52,4 +49,4 @@ return (String indexes){
s = s+as.integer(as.scalar(vector[1,i]))+","
}
indexes = s+as.integer(as.scalar(vector[1,ncol(vector)]))
-}
\ No newline at end of file
+}
diff --git a/scripts/builtin/winsorize.dml b/scripts/builtin/winsorize.dml
index 614630d..d00e25e 100644
--- a/scripts/builtin/winsorize.dml
+++ b/scripts/builtin/winsorize.dml
@@ -19,6 +19,26 @@
#
#-------------------------------------------------------------
+# The winsorize-function removes outliers from the data. It does so by computing the upper and
+# lower quantiles of the given data, then it replaces any value that falls outside this range
+# (less than the lower quantile or more than the upper quantile).
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Input feature matrix
+# verbose Boolean FALSE To print output on screen
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# Y Matrix[Double] Matrix without outlier values
+# ----------------------------------------------------------------------------------------------------------------------
+
m_winsorize = function(Matrix[Double] X, Double ql = 0.05, Double qu = 0.95, Boolean verbose)
return (Matrix[Double] Y) {
diff --git a/scripts/builtin/xdummy1.dml b/scripts/builtin/xdummy1.dml
index 187cf73..5dbcbdd 100644
--- a/scripts/builtin/xdummy1.dml
+++ b/scripts/builtin/xdummy1.dml
@@ -17,6 +17,23 @@
# specific language governing permissions and limitations
# under the License.
#
+#-------------------------------------------------------------
+
+# This builtin function is here for debugging purposes
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------
+# X Matrix[Double] --- test input
+# ----------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------
+# Y Matrix[Double] test result
+# ----------------------------------------------------
m_xdummyTypo = function(Matrix[Double] X) return (Matrix[Double] Y) {
Y = cor(X);
diff --git a/scripts/builtin/xdummy2.dml b/scripts/builtin/xdummy2.dml
index 286ba82..9fe9ee0 100644
--- a/scripts/builtin/xdummy2.dml
+++ b/scripts/builtin/xdummy2.dml
@@ -17,6 +17,24 @@
# specific language governing permissions and limitations
# under the License.
#
+#-------------------------------------------------------------
+
+# This builtin function is here for debugging purposes
+#
+# INPUT PARAMETERS:
+# ------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ------------------------------------------------------
+# X Matrix[Double] --- Debug input
+# ------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------
+# NAME TYPE MEANING
+# ------------------------------------------------------
+# Y Matrix[Double] ---
+# Z Matrix[Double] ---
+# ------------------------------------------------------
m_xdummyTypo = function(Matrix[Double] X) return (Matrix[Double] Y, Matrix[Double] Z) {
Y = cor(X);
diff --git a/scripts/builtin/xgboost.dml b/scripts/builtin/xgboost.dml
index dfaef13..b0df6a3 100644
--- a/scripts/builtin/xgboost.dml
+++ b/scripts/builtin/xgboost.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,14 +19,16 @@
#
#-------------------------------------------------------------
-
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
+# XGBoost is a decision-tree-based ensemble Machine Learning algorithm that uses gradient boosting. This xgboost
+# implementation supports classification and regression and is capable of working with categorical and scalar features.
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
# X Matrix[Double] --- Feature matrix X; note that X needs to be both recoded and dummy coded
# Y Matrix[Double] --- Label matrix Y; note that Y needs to be both recoded and dummy coded
-# R Matrix[Double] 1, 1xn Matrix R; 1xn vector which for each feature in X contains the following information
+# R Matrix[Double] Matrix Matrix R; 1xn vector which for each feature in X contains the following information
# - R[,1]: 1 (scalar feature)
# - R[,2]: 2 (categorical feature)
# Feature 1 is a scalar feature and features 2 is a categorical feature
@@ -36,21 +38,30 @@
# learning_rate Double 0.3 Alias: eta. After each boosting step the learning rate controls the weights of the new predictions
# max_depth Integer 6 Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit
# lambda Double 0.0 L2 regularization term on weights. Increasing this value will make model more conservative and reduce amount of leaves of a tree
-# ---------------------------------------------------------------------------------------------
-
-# ---------------------------------------------------------------------------------------------
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
# OUTPUT:
-# Matrix M where each column corresponds to a node in the learned tree (the first node is the init prediction) and each row contains the following information:
-# M[1,j]: id of node j (in a complete binary tree)
-# M[2,j]: tree id to which node j belongs
-# M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
-# M[4,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical)
-# that node j looks at if j is an internal node, otherwise 0
-# M[5,j]: Type of the feature that node j looks at if j is an internal node. if leaf = 0, if scalar = 1, if categorical = 2
-# M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale,
-# otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
-# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
-# -------------------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# M Matrix[Double] Matrix M where each column corresponds to a node in the learned tree
+# (the first node is the init prediction) and each row contains
+# the following information:
+# M[1,j]: id of node j (in a complete binary tree)
+# M[2,j]: tree id to which node j belongs
+# M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
+# M[4,j]: Feature index of the feature (scale feature id if the feature is
+# scale or categorical feature id if the feature is categorical)
+# that node j looks at if j is an internal node, otherwise 0
+# M[5,j]: Type of the feature that node j looks at if j is an internal node.
+# if leaf = 0, if scalar = 1, if categorical = 2
+# M[6:,j]: If j is an internal node: Threshold the example's feature value is
+# compared to is stored at M[6,j] if the feature chosen for j is scale,
+# otherwise if the feature chosen for j is categorical rows 6,7,... depict
+# the value subset chosen for j
+# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
+# ----------------------------------------------------------------------------------------------------------------------
m_xgboost = function(Matrix[Double] X, Matrix[Double] y,
Matrix[Double] R = matrix(1,rows=1,cols=nrow(X)), Integer sml_type = 1, Integer num_trees = 7,
diff --git a/scripts/builtin/xgboostPredictClassification.dml b/scripts/builtin/xgboostPredictClassification.dml
index 1102fff..19b4c8c 100644
--- a/scripts/builtin/xgboostPredictClassification.dml
+++ b/scripts/builtin/xgboostPredictClassification.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,24 +19,27 @@
#
#-------------------------------------------------------------
-
+# XGBoost is a decision-tree-based ensemble Machine Learning algorithm that uses gradient boosting. This xgboost
+# implementation supports classification and is capable of working with categorical features.
+#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Matrix --- Matrix of feature vectors we want to predict (X_test)
-# M Matrix --- The model created at xgboost
-# learning_rate Double 0.3 the learning rate used in the model
-
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# P Matrix --- The predictions of the samples using the given xgboost model. (y_prediction)
-# ----------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors we want to predict (X_test)
+# M Matrix[Double] --- The model created at xgboost
+# learning_rate Double 0.3 The learning rate used in the model
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# P Matrix[Double] The predictions of the samples using the given xgboost model. (y_prediction)
+# ----------------------------------------------------------------------------------------------------------------------
-m_xgboostPredictClassification = function(Matrix[Double] X, Matrix[Double] M, Double learning_rate = 0.3
-) return (Matrix[Double] P) {
+m_xgboostPredictClassification = function(Matrix[Double] X, Matrix[Double] M, Double learning_rate = 0.3)
+ return (Matrix[Double] P) {
nr_trees = max(M[2,])
P = matrix(0, rows=nrow(X), cols=1)
diff --git a/scripts/builtin/xgboostPredictRegression.dml b/scripts/builtin/xgboostPredictRegression.dml
index 301807f..c0f3c7d 100644
--- a/scripts/builtin/xgboostPredictRegression.dml
+++ b/scripts/builtin/xgboostPredictRegression.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,21 +19,24 @@
#
#-------------------------------------------------------------
-
+# XGBoost is a decision-tree-based ensemble Machine Learning algorithm that uses gradient boosting. This xgboost
+# implementation supports regression.
+#
# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# X Matrix --- Matrix of feature vectors we want to predict (X_test)
-# M Matrix --- The model created at xgboost
-# learning_rate Double 0.3 the learning rate used in the model
-
-# RETURN VALUES
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# P Matrix --- The predictions of the samples using the given xgboost model. (y_prediction)
-# ----------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# X Matrix[Double] --- Matrix of feature vectors we want to predict (X_test)
+# M Matrix[Double] --- The model created by xgboost
+# learning_rate Double 0.3 The learning rate used in the model
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME TYPE MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# P Matrix[Double] The predictions of the samples using the given xgboost model. (y_prediction)
+# ----------------------------------------------------------------------------------------------------------------------
m_xgboostPredictRegression = function(Matrix[Double] X, Matrix[Double] M, Double learning_rate = 0.3)
return (Matrix[Double] P)