You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:26 UTC
[50/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enfore Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Cox-predict.dml b/scripts/algorithms/Cox-predict.dml
index 6f2bec0..7d444fd 100644
--- a/scripts/algorithms/Cox-predict.dml
+++ b/scripts/algorithms/Cox-predict.dml
@@ -1,181 +1,181 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the input matrix containing the survival data with the following schema:
-# - X[,1]: timestamps
-# - X[,2]: whether an event occurred (1) or data is censored (0)
-# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting
-# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X
-# M String --- Location to read matrix M containing the fitted Cox model with the following schema:
-# - M[,1]: betas
-# - M[,2]: exp(betas)
-# - M[,3]: standard error of betas
-# - M[,4]: Z
-# - M[,5]: p-value
-# - M[,6]: lower 100*(1-alpha)% confidence interval of betas
-# - M[,7]: upper 100*(1-alpha)% confidence interval of betas
-# Y String --- Location to read matrix Y used for prediction
-# COV String --- Location to read the variance-covariance matrix of the betas
-# MF String --- Location to read column indices of X excluding the baseline factors if available
-# P String --- Location to store matrix P containing the results of prediction
-# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- A matrix P with the following schema:
-# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature
-# i.e., (Y[3:] - colMeans (X[3:])) %*% b
-# P[,2]: standard error of linear predictors
-# P[,3]: risk relative to a baseline which contains the mean values for each feature
-# i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b)
-# P[,4]: standard error of risk
-# P[,5]: estimates of cumulative hazard
-# P[,6]: standard error of the estimates of cumulative hazard
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y
-# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv
-
-fileX = $X;
-fileRT = $RT;
-fileMF = $MF;
-fileY = $Y;
-fileM = $M;
-fileCOV = $COV;
-fileP = $P;
-
-# Default values of some parameters
-fmtO = ifdef ($fmt, "text"); # $fmt="text"
-
-X_orig = read (fileX);
-RT_X = read (fileRT);
-Y_orig = read (fileY);
-M = read (fileM);
-b = M[,1];
-COV = read (fileCOV);
-
-col_ind = read (fileMF);
-tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind));
-Y_orig = Y_orig %*% tab;
-
-
-# Y and X have the same dimensions and schema
-if (ncol (Y_orig) != ncol (X_orig)) {
- stop ("Y has a wrong number of columns!");
-}
-
-X = X_orig[,3:ncol (X_orig)];
-T_X = X_orig[,1];
-E_X = X_orig[,2];
-D = ncol (X);
-N = nrow (X);
-Y_orig = order (target = Y_orig, by = 1);
-Y = Y_orig[,3:ncol (X_orig)];
-T_Y = Y_orig[,1];
-
-col_means = colMeans (X);
-ones = matrix (1, rows = nrow (Y), cols = 1);
-Y_rel = Y - (ones %*% col_means);
-
-##### compute linear predictors
-LP = Y_rel %*% b;
-# compute standard error of linear predictors using the Delta method
-se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel)));
-
-##### compute risk
-R = exp (Y_rel %*% b);
-# compute standard error of risk using the Delta method
-se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP));
-
-##### compute estimates of cumulative hazard together with their standard errors:
-# 1. col contains cumulative hazard estimates
-# 2. col contains standard errors for cumulative hazard estimates
-
-d_r = aggregate (target = E_X, groups = RT_X, fn = "sum");
-e_r = aggregate (target = RT_X, groups = RT_X, fn = "count");
-Idx = cumsum (e_r);
-all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times
-
-event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows");
-num_distinct_event = nrow (event_times);
-
-num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored
-I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1));
-e_r_rev_agg = cumsum (I_rev %*% e_r);
-select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg)));
-
-min_event_time = min (event_times);
-max_event_time = max (event_times);
-T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<"));
-T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">"));
-
-Ind = outer (T_Y, t (event_times), ">=");
-Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event);
-
-exp_Xb = exp (X %*% b);
-exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum");
-exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg);
-
-H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows"));
-P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows"));
-X_exp_Xb = X * exp (X %*% b);
-
-I_rev_all = table (seq (1, N, 1), seq (N, 1, -1));
-X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb);
-X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows");
-X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg;
-P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows"));
-
-exp_Yb = exp (Y %*% b);
-exp_Yb_2 = exp_Yb ^ 2;
-Y_exp_Yb = Y * exp (Y %*% b);
-
-# estimates of cumulative hazard
-H = exp_Yb * (Ind %*% H0);
-
-# term1
-term1 = exp_Yb_2 * (Ind %*% P1);
-
-# term2
-P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows"));
-P4 = (Ind %*% P2) * exp_Yb;
-P5 = Y_exp_Yb * (Ind %*% P3);
-term2 = P4 - P5;
-
-# standard error of the estimates of cumulative hazard
-se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2));
-
-# prepare output matrix
-P = matrix (0, rows = nrow (Y), cols = 6);
-P[,1] = LP;
-P[,2] = se_LP;
-P[,3] = R;
-P[,4] = se_R;
-P[,5] = H;
-P[,6] = se_H;
-write (P, fileP, format=fmtO);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- Location to read the input matrix containing the survival data with the following schema:
+# - X[,1]: timestamps
+# - X[,2]: whether an event occurred (1) or data is censored (0)
+# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting
+# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X
+# M String --- Location to read matrix M containing the fitted Cox model with the following schema:
+# - M[,1]: betas
+# - M[,2]: exp(betas)
+# - M[,3]: standard error of betas
+# - M[,4]: Z
+# - M[,5]: p-value
+# - M[,6]: lower 100*(1-alpha)% confidence interval of betas
+# - M[,7]: upper 100*(1-alpha)% confidence interval of betas
+# Y String --- Location to read matrix Y used for prediction
+# COV String --- Location to read the variance-covariance matrix of the betas
+# MF String --- Location to read column indices of X excluding the baseline factors if available
+# P String --- Location to store matrix P containing the results of prediction
+# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- A matrix P with the following schema:
+# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature
+# i.e., (Y[3:] - colMeans (X[3:])) %*% b
+# P[,2]: standard error of linear predictors
+# P[,3]: risk relative to a baseline which contains the mean values for each feature
+# i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b)
+# P[,4]: standard error of risk
+# P[,5]: estimates of cumulative hazard
+# P[,6]: standard error of the estimates of cumulative hazard
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y
+# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv
+
+fileX = $X;
+fileRT = $RT;
+fileMF = $MF;
+fileY = $Y;
+fileM = $M;
+fileCOV = $COV;
+fileP = $P;
+
+# Default values of some parameters
+fmtO = ifdef ($fmt, "text"); # $fmt="text"
+
+X_orig = read (fileX);
+RT_X = read (fileRT);
+Y_orig = read (fileY);
+M = read (fileM);
+b = M[,1];
+COV = read (fileCOV);
+
+col_ind = read (fileMF);
+tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind));
+Y_orig = Y_orig %*% tab;
+
+
+# Y and X have the same dimensions and schema
+if (ncol (Y_orig) != ncol (X_orig)) {
+ stop ("Y has a wrong number of columns!");
+}
+
+X = X_orig[,3:ncol (X_orig)];
+T_X = X_orig[,1];
+E_X = X_orig[,2];
+D = ncol (X);
+N = nrow (X);
+Y_orig = order (target = Y_orig, by = 1);
+Y = Y_orig[,3:ncol (X_orig)];
+T_Y = Y_orig[,1];
+
+col_means = colMeans (X);
+ones = matrix (1, rows = nrow (Y), cols = 1);
+Y_rel = Y - (ones %*% col_means);
+
+##### compute linear predictors
+LP = Y_rel %*% b;
+# compute standard error of linear predictors using the Delta method
+se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel)));
+
+##### compute risk
+R = exp (Y_rel %*% b);
+# compute standard error of risk using the Delta method
+se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP));
+
+##### compute estimates of cumulative hazard together with their standard errors:
+# 1. col contains cumulative hazard estimates
+# 2. col contains standard errors for cumulative hazard estimates
+
+d_r = aggregate (target = E_X, groups = RT_X, fn = "sum");
+e_r = aggregate (target = RT_X, groups = RT_X, fn = "count");
+Idx = cumsum (e_r);
+all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times
+
+event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows");
+num_distinct_event = nrow (event_times);
+
+num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored
+I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1));
+e_r_rev_agg = cumsum (I_rev %*% e_r);
+select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg)));
+
+min_event_time = min (event_times);
+max_event_time = max (event_times);
+T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<"));
+T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">"));
+
+Ind = outer (T_Y, t (event_times), ">=");
+Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event);
+
+exp_Xb = exp (X %*% b);
+exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum");
+exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg);
+
+H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows"));
+P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows"));
+X_exp_Xb = X * exp (X %*% b);
+
+I_rev_all = table (seq (1, N, 1), seq (N, 1, -1));
+X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb);
+X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows");
+X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg;
+P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows"));
+
+exp_Yb = exp (Y %*% b);
+exp_Yb_2 = exp_Yb ^ 2;
+Y_exp_Yb = Y * exp (Y %*% b);
+
+# estimates of cumulative hazard
+H = exp_Yb * (Ind %*% H0);
+
+# term1
+term1 = exp_Yb_2 * (Ind %*% P1);
+
+# term2
+P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows"));
+P4 = (Ind %*% P2) * exp_Yb;
+P5 = Y_exp_Yb * (Ind %*% P3);
+term2 = P4 - P5;
+
+# standard error of the estimates of cumulative hazard
+se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2));
+
+# prepare output matrix
+P = matrix (0, rows = nrow (Y), cols = 6);
+P[,1] = LP;
+P[,2] = se_LP;
+P[,3] = R;
+P[,4] = se_R;
+P[,5] = H;
+P[,6] = se_H;
+write (P, fileP, format=fmtO);
+
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Cox.dml b/scripts/algorithms/Cox.dml
index 3cab74d..6da22ce 100644
--- a/scripts/algorithms/Cox.dml
+++ b/scripts/algorithms/Cox.dml
@@ -1,502 +1,502 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
-# The Breslow method is used for handling ties and the regression parameters
-# are computed using trust region newton method with conjugate gradient
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the input matrix X containing the survival data containing the following information
-# - 1: timestamps
-# - 2: whether an event occurred (1) or data is censored (0)
-# - 3: feature vectors
-# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
-# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model
-# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
-# the start and end indices of the factors in X
-# - R[,1]: start indices
-# - R[,2]: end indices
-# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
-# in this case the start and end indices corresponding to the baseline level need to be the same;
-# if R is not provided by default all variables are considered to be continuous
-# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
-# Cox model (the betas), their standard errors, confidence intervals, and P-values
-# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including
-# no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
-# by default is standard output
-# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
-# by default is standard output
-# COV String --- Location to store the variance-covariance matrix of the betas
-# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X
-# XO String --- Location to store sorted input matrix by the timestamps
-# MF String --- Location to store column indices of X excluding the baseline factors if available
-# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas
-# tol Double 0.000001 Tolerance ("epsilon")
-# moi Int 100 Max. number of outer (Newton) iterations
-# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
-# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
-# M[,1]: betas
-# M[,2]: exp(betas)
-# M[,3]: standard error of betas
-# M[,4]: Z
-# M[,5]: P-value
-# M[,6]: lower 100*(1-alpha)% confidence interval of betas
-# M[,7]: upper 100*(1-alpha)% confidence interval of betas
-#
-# Two log files containing a summary of some statistics of the fitted model:
-# 1- File S with the following format
-# - line 1: no. of observations
-# - line 2: no. of events
-# - line 3: log-likelihood
-# - line 4: AIC
-# - line 5: Rsquare (Cox & Snell)
-# - line 6: max possible Rsquare
-# 2- File T with the following format
-# - line 1: Likelihood ratio test statistic, degree of freedom, P-value
-# - line 2: Wald test statistic, degree of freedom, P-value
-# - line 3: Score (log-rank) test statistic, degree of freedom, P-value
-#
-# Additionally, the following matrices are stored (needed for prediction)
-# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
-# 2- Matrix XO which is matrix X with sorted timestamps
-# 3- Variance-covariance matrix of the betas COV
-# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
-# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
-# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv
-
-fileX = $X;
-fileTE = $TE;
-fileRT = $RT;
-fileMF = $MF;
-fileM = $M;
-fileXO = $XO;
-fileCOV = $COV;
-
-# Default values of some parameters
-fileF = ifdef ($F, " "); # $F=" "
-fileR = ifdef ($R, " "); # $R=" "
-fileS = ifdef ($S, " "); # $S=" "
-fileT = ifdef ($T, " "); # $T=" "
-fmtO = ifdef ($fmt, "text"); # $fmt="text"
-alpha = ifdef ($alpha, 0.05); # $alpha=0.05
-tol = ifdef ($tol, 0.000001); # $tol=0.000001;
-maxiter = ifdef ($moi, 100); # $moi=100;
-maxinneriter = ifdef ($mii, 0); # $mii=0;
-
-X_orig = read (fileX);
-
-TE = read (fileTE);
-if (fileF != " ") {
- F = read (fileF);
-}
-
-######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
-
-if (fileR != " ") { # factors available
- R = read (fileR);
- if (ncol (R) != 2) {
- stop ("Matrix R has wrong dimensions!");
- }
- print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
- # identify baseline columns to be removed from X_orig
- col_sum = colSums (X_orig);
- col_seq = t (seq(1, ncol (X_orig)));
- parfor (i in 1:nrow (R), check = 0) {
- start_ind = as.scalar (R[i,1]);
- end_ind = as.scalar (R[i,2]);
- baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
- col_seq[,baseline_ind] = 0;
- }
- ones = matrix (1, rows = nrow (F), cols = 1);
- F_filter = table (ones, F, 1, ncol (X_orig));
- F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
- TE_F = t(append (t (TE), F_filter));
-} else if (fileF != " ") { # all features scale
- TE_F = t(append (t (TE), t(F)));
-} else { # no features available
- TE_F = TE;
-}
-
-write (TE_F, fileMF, format = fmtO);
-
-X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
-
-######## RECODING TIMESTAMPS PRESERVING THE ORDER
-print ("RECODING TIMESTAMPS...");
-
-N = nrow (X_orig);
-X_orig = order (target = X_orig, by = 1);
-Idx = matrix (1, rows = N, cols = 1);
-num_timestamps = 1;
-if (N == 1) {
- RT = matrix (1, rows = 1, cols = 1);
-} else {
- Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!=");
- num_timestamps = sum (Idx);
- A = removeEmpty (target = diag (Idx), margin = "cols");
- if (ncol (A) > 1) {
- A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
- B = cumsum (A);
- RT = B %*% seq(1, ncol(B));
- } else { # there is only one group
- RT = matrix (1, rows = N, cols = 1);
- }
-}
-E = X_orig[,2];
-
-print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
-
-######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
-# b: the regression betas
-# o: loss function value
-# g: loss function gradient
-# H: loss function Hessian
-# sb: shift of b in one iteration
-# so: shift of o in one iteration
-# r: CG residual = H %*% sb + g
-# d: CG direction vector
-# Hd: = H %*% d
-# c: scalar coefficient in CG
-# delta: trust region size
-# tol: tolerance value
-# i: outer (Newton) iteration count
-# j: inner (CG) iteration count
-
-# computing initial coefficients b (all initialized to 0)
-if (ncol (X_orig) > 2) {
- X = X_orig[,3:ncol(X_orig)];
- D = ncol (X);
- zeros_D = matrix (0, rows = D, cols = 1);
- b = zeros_D;
-}
-d_r = aggregate (target = E, groups = RT, fn = "sum");
-e_r = aggregate (target = RT, groups = RT, fn = "count");
-
-# computing initial loss function value o
-num_distinct = nrow (d_r); # no. of distinct timestamps
-e_r_rev_agg = cumsum (rev(e_r));
-d_r_rev = rev(d_r);
-o = sum (d_r_rev * log (e_r_rev_agg));
-o_init = o;
-if (ncol (X_orig) < 3) {
- loglik = -o;
- S_str = "no. of records " + N + " loglik " + loglik;
- if (fileS != " ") {
- write (S_str, fileS, format = fmtO);
- } else {
- print (S_str);
- }
- stop ("No features are selected!");
-}
-
-# computing initial gradient g
-# part 1 g0_1
-g0_1 = - t (colSums (X * E)); # g_1
-# part 2 g0_2
-X_rev_agg = cumsum (rev(X));
-select = table (seq (1, num_distinct), e_r_rev_agg);
-X_agg = select %*% X_rev_agg;
-g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
-#
-g0 = g0_1 + g0_2;
-g = g0;
-
-# initialization for trust region Newton method
-delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
-initial_g2 = sum (g ^ 2);
-exit_g2 = initial_g2 * tol ^ 2;
-maxiter = 100;
-maxinneriter = min (D, 100);
-i = 0;
-sum_g2 = sum (g ^ 2);
-while (sum_g2 > exit_g2 & i < maxiter) {
- i = i + 1;
- sb = zeros_D;
- r = g;
- r2 = sum (r ^ 2);
- exit_r2 = 0.01 * r2;
- d = - r;
- trust_bound_reached = FALSE;
- j = 0;
-
- exp_Xb = exp (X %*% b);
- exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
- D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
- X_exp_Xb = X * exp_Xb;
- X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
- X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
-
- while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
- j = j + 1;
- # computing Hessian times d (Hd)
- # part 1 Hd_1
- Xd = X %*% d;
- X_Xd_exp_Xb = X * (Xd * exp_Xb);
- X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
- X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
-
- Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
- # part 2 Hd_2
-
- Xd_exp_Xb = Xd * exp_Xb;
- Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
- Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
-
- Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
- Hd_2 = Hd_2_num / (D_r_rev ^ 2);
-
- Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
-
- c = r2 / sum (d * Hd);
- [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
- sb = sb + c * d;
- r = r + c * Hd;
- r2_new = sum (r ^ 2);
- d = - r + (r2_new / r2) * d;
- r2 = r2_new;
- }
-
- # computing loss change in 1 iteration (so)
- # part 1 so_1
- so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
- # part 2 so_2
- exp_Xbsb = exp (X %*% (b + sb));
- exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
- so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
- #
- so = so_1 + so_2;
- so = so - o;
-
- delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
- if (so < 0) {
- b = b + sb;
- o = o + so;
- # compute new gradient g
- exp_Xb = exp (X %*% b);
- exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
- X_exp_Xb = X * exp_Xb;
- X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
- X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
-
- D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
- g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
- g = g0_1 + g_2;
- sum_g2 = sum (g ^ 2);
- }
-}
-
-if (sum_g2 > exit_g2 & i >= maxiter) {
- print ("Trust region Newton method did not converge!");
-}
-
-
-print ("COMPUTING HESSIAN...");
-
-H0 = matrix (0, rows = D, cols = D);
-H = matrix (0, rows = D, cols = D);
-
-X_exp_Xb_rev_2 = rev(X_exp_Xb);
-X_rev_2 = rev(X);
-
-X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
-X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
-
-parfor (i in 1:D, check = 0) {
- Xi = X[,i];
- Xi_rev = rev(Xi);
-
- ## ----------Start calculating H0--------------
- # part 1 H0_1
- Xi_X = X_rev_2[,i:D] * Xi_rev;
- Xi_X_rev_agg = cumsum (Xi_X);
- Xi_X_rev_agg = select %*% Xi_X_rev_agg;
- H0_1 = Xi_X_rev_agg / e_r_rev_agg;
-
- # part 2 H0_2
- Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
- Xi_agg_rev_agg = cumsum (rev(Xi_agg));
- H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
- H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
-
- H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
- #-----------End calculating H0--------------------
-
- ## ----------Start calculating H--------------
- # part 1 H_1
- Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
- Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
- Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
- H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
-
- # part 2 H_2
- Xi_exp_Xb = exp_Xb * Xi;
- Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
-
- Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
- H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
- H_2 = H_2_num / (D_r_rev ^ 2);
- H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
- #-----------End calculating H--------------------
-}
-H = H + t(H) - diag( diag (H));
-H0 = H0 + t(H0) - diag( diag (H0));
-
-
-# compute standard error for betas
-H_inv = inv (H);
-se_b = sqrt (diag (H_inv));
-
-# compute exp(b), Z, Pr[>|Z|]
-exp_b = exp (b);
-Z = b / se_b;
-P = matrix (0, rows = D, cols = 1);
-parfor (i in 1:D) {
- P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
-}
-
-# compute confidence intervals for b
-z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
-CI_l = b - se_b * z_alpha_2;
-CI_r = b - se_b + z_alpha_2;
-
-######## SOME STATISTICS AND TESTS
-# no. of records
-S_str = "no. of records " + N;
-
-# no.of events
-S_str = append (S_str, "no. of events " + sum (E));
-
-# log-likelihood
-loglik = -o;
-S_str = append (S_str, "loglik " + loglik + " ");
-
-# AIC = -2 * loglik + 2 * D
-AIC = -2 * loglik + 2 * D;
-S_str = append (S_str, "AIC " + AIC + " ");
-
-# Wald test
-wald_t = as.scalar (t(b) %*% H %*% b);
-wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
-T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
-
-# Likelihood ratio test
-lratio_t = 2 * o_init - 2 * o;
-lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
-T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");
-
-
-H0_inv = inv (H0);
-score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
-score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
-T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
-
-# Rsquare (Cox & Snell)
-Rsquare = 1 - exp (-lratio_t / N);
-Rsquare_max = 1 - exp (-2 * o_init / N);
-S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
-S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
-
-M = matrix (0, rows = D, cols = 7);
-M[,1] = b;
-M[,2] = exp_b;
-M[,3] = se_b;
-M[,4] = Z;
-M[,5] = P;
-M[,6] = CI_l;
-M[,7] = CI_r;
-
-write (M, fileM, format = fmtO);
-if (fileS != " ") {
- write (S_str, fileS, format = fmtO);
-} else {
- print (S_str);
-}
-if (fileT != " ") {
- write (T_str, fileT, format = fmtO);
-} else {
- print (T_str);
-}
-# needed for prediction
-write (RT, fileRT, format = fmtO);
-write (H_inv, fileCOV, format = fmtO);
-write (X_orig, fileXO, format = fmtO);
-
-
-####### UDFS FOR TRUST REGION NEWTON METHOD
-
-ensure_trust_bound =
- function (double x, double a, double b, double c)
- return (double x_new, boolean is_violated)
-{
- if (a * x^2 + b * x + c > 0)
- {
- is_violated = TRUE;
- rad = sqrt (b ^ 2 - 4 * a * c);
- if (b >= 0) {
- x_new = - (2 * c) / (b + rad);
- } else {
- x_new = - (b - rad) / (2 * a);
- }
- } else {
- is_violated = FALSE;
- x_new = x;
- }
-}
-
-update_trust_bound =
- function (double delta,
- double sb_distance,
- double so_exact,
- double so_linear_approx,
- double so_quadratic_approx)
- return (double delta)
-{
- sigma1 = 0.25;
- sigma2 = 0.5;
- sigma3 = 4.0;
-
- if (so_exact <= so_linear_approx) {
- alpha = sigma3;
- } else {
- alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
- }
-
- rho = so_exact / so_quadratic_approx;
- if (rho < 0.0001) {
- delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
- } else { if (rho < 0.25) {
- delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
- } else { if (rho < 0.75) {
- delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
- } else {
- delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
- }}}
-}
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL.
+# The Breslow method is used for handling ties and the regression parameters
+# are computed using trust region newton method with conjugate gradient
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- Location to read the input matrix X containing the survival data containing the following information
+# - 1: timestamps
+# - 2: whether an event occurred (1) or data is censored (0)
+# - 3: feature vectors
+# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row)
+# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model
+# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing
+# the start and end indices of the factors in X
+# - R[,1]: start indices
+# - R[,2]: end indices
+# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X;
+# in this case the start and end indices corresponding to the baseline level need to be the same;
+# if R is not provided by default all variables are considered to be continuous
+# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted
+# Cox model (the betas), their standard errors, confidence intervals, and P-values
+# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including
+# no. of records, no. of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare;
+# by default is standard output
+# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model;
+# by default is standard output
+# COV String --- Location to store the variance-covariance matrix of the betas
+# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X
+# XO String --- Location to store sorted input matrix by the timestamps
+# MF String --- Location to store column indices of X excluding the baseline factors if available
+# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas
+# tol Double 0.000001 Tolerance ("epsilon")
+# moi Int 100 Max. number of outer (Newton) iterations
+# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max
+# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only)
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema:
+# M[,1]: betas
+# M[,2]: exp(betas)
+# M[,3]: standard error of betas
+# M[,4]: Z
+# M[,5]: P-value
+# M[,6]: lower 100*(1-alpha)% confidence interval of betas
+# M[,7]: upper 100*(1-alpha)% confidence interval of betas
+#
+# Two log files containing a summary of some statistics of the fitted model:
+# 1- File S with the following format
+# - line 1: no. of observations
+# - line 2: no. of events
+# - line 3: log-likelihood
+# - line 4: AIC
+# - line 5: Rsquare (Cox & Snell)
+# - line 6: max possible Rsquare
+# 2- File T with the following format
+# - line 1: Likelihood ratio test statistic, degree of freedom, P-value
+# - line 2: Wald test statistic, degree of freedom, P-value
+# - line 3: Score (log-rank) test statistic, degree of freedom, P-value
+#
+# Additionally, the following matrices are stored (needed for prediction)
+# 1- A column matrix RT that contains the order-preserving recoded timestamps from X
+# 2- Matrix XO which is matrix X with sorted timestamps
+# 3- Variance-covariance matrix of the betas COV
+# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available)
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R
+# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT
+# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv
+
+fileX = $X;
+fileTE = $TE;
+fileRT = $RT;
+fileMF = $MF;
+fileM = $M;
+fileXO = $XO;
+fileCOV = $COV;
+
+# Default values of some parameters
+fileF = ifdef ($F, " "); # $F=" "
+fileR = ifdef ($R, " "); # $R=" "
+fileS = ifdef ($S, " "); # $S=" "
+fileT = ifdef ($T, " "); # $T=" "
+fmtO = ifdef ($fmt, "text"); # $fmt="text"
+alpha = ifdef ($alpha, 0.05); # $alpha=0.05
+tol = ifdef ($tol, 0.000001); # $tol=0.000001;
+maxiter = ifdef ($moi, 100); # $moi=100;
+maxinneriter = ifdef ($mii, 0); # $mii=0;
+
+X_orig = read (fileX);
+
+TE = read (fileTE);
+if (fileF != " ") {
+ F = read (fileF);
+}
+
+######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET
+
+if (fileR != " ") { # factors available
+ R = read (fileR);
+ if (ncol (R) != 2) {
+ stop ("Matrix R has wrong dimensions!");
+ }
+ print ("REMOVING BASLINE LEVEL OF EACH FACTOR...");
+ # identify baseline columns to be removed from X_orig
+ col_sum = colSums (X_orig);
+ col_seq = t (seq(1, ncol (X_orig)));
+ parfor (i in 1:nrow (R), check = 0) {
+ start_ind = as.scalar (R[i,1]);
+ end_ind = as.scalar (R[i,2]);
+ baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1;
+ col_seq[,baseline_ind] = 0;
+ }
+ ones = matrix (1, rows = nrow (F), cols = 1);
+ F_filter = table (ones, F, 1, ncol (X_orig));
+ F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols");
+ TE_F = t(append (t (TE), F_filter));
+} else if (fileF != " ") { # all features scale
+ TE_F = t(append (t (TE), t(F)));
+} else { # no features available
+ TE_F = TE;
+}
+
+write (TE_F, fileMF, format = fmtO);
+
+X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F));
+
+######## RECODING TIMESTAMPS PRESERVING THE ORDER
+print ("RECODING TIMESTAMPS...");
+
+N = nrow (X_orig);
+X_orig = order (target = X_orig, by = 1);
+Idx = matrix (1, rows = N, cols = 1);
+num_timestamps = 1;
+if (N == 1) {
+ RT = matrix (1, rows = 1, cols = 1);
+} else {
+ Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!=");
+ num_timestamps = sum (Idx);
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ RT = B %*% seq(1, ncol(B));
+ } else { # there is only one group
+ RT = matrix (1, rows = N, cols = 1);
+ }
+}
+E = X_orig[,2];
+
+print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT");
+
+######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT
+# b: the regression betas
+# o: loss function value
+# g: loss function gradient
+# H: loss function Hessian
+# sb: shift of b in one iteration
+# so: shift of o in one iteration
+# r: CG residual = H %*% sb + g
+# d: CG direction vector
+# Hd: = H %*% d
+# c: scalar coefficient in CG
+# delta: trust region size
+# tol: tolerance value
+# i: outer (Newton) iteration count
+# j: inner (CG) iteration count
+
+# computing initial coefficients b (all initialized to 0)
+if (ncol (X_orig) > 2) {
+ X = X_orig[,3:ncol(X_orig)];
+ D = ncol (X);
+ zeros_D = matrix (0, rows = D, cols = 1);
+ b = zeros_D;
+}
+d_r = aggregate (target = E, groups = RT, fn = "sum");
+e_r = aggregate (target = RT, groups = RT, fn = "count");
+
+# computing initial loss function value o
+num_distinct = nrow (d_r); # no. of distinct timestamps
+e_r_rev_agg = cumsum (rev(e_r));
+d_r_rev = rev(d_r);
+o = sum (d_r_rev * log (e_r_rev_agg));
+o_init = o;
+if (ncol (X_orig) < 3) {
+ loglik = -o;
+ S_str = "no. of records " + N + " loglik " + loglik;
+ if (fileS != " ") {
+ write (S_str, fileS, format = fmtO);
+ } else {
+ print (S_str);
+ }
+ stop ("No features are selected!");
+}
+
+# computing initial gradient g
+# part 1 g0_1
+g0_1 = - t (colSums (X * E)); # g_1
+# part 2 g0_2
+X_rev_agg = cumsum (rev(X));
+select = table (seq (1, num_distinct), e_r_rev_agg);
+X_agg = select %*% X_rev_agg;
+g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg));
+#
+g0 = g0_1 + g0_2;
+g = g0;
+
+# initialization for trust region Newton method
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2)));
+initial_g2 = sum (g ^ 2);
+exit_g2 = initial_g2 * tol ^ 2;
+maxiter = 100;
+maxinneriter = min (D, 100);
+i = 0;
+sum_g2 = sum (g ^ 2);
+while (sum_g2 > exit_g2 & i < maxiter) {
+ i = i + 1;
+ sb = zeros_D;
+ r = g;
+ r2 = sum (r ^ 2);
+ exit_r2 = 0.01 * r2;
+ d = - r;
+ trust_bound_reached = FALSE;
+ j = 0;
+
+ exp_Xb = exp (X %*% b);
+ exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
+ D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
+ X_exp_Xb = X * exp_Xb;
+ X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+ X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+ while (r2 > exit_r2 & (! trust_bound_reached) & j < maxinneriter) {
+ j = j + 1;
+ # computing Hessian times d (Hd)
+ # part 1 Hd_1
+ Xd = X %*% d;
+ X_Xd_exp_Xb = X * (Xd * exp_Xb);
+ X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb));
+ X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg;
+
+ Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev;
+ # part 2 Hd_2
+
+ Xd_exp_Xb = Xd * exp_Xb;
+ Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb));
+ Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg;
+
+ Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator
+ Hd_2 = Hd_2_num / (D_r_rev ^ 2);
+
+ Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev));
+
+ c = r2 / sum (d * Hd);
+ [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2);
+ sb = sb + c * d;
+ r = r + c * Hd;
+ r2_new = sum (r ^ 2);
+ d = - r + (r2_new / r2) * d;
+ r2 = r2_new;
+ }
+
+ # computing loss change in 1 iteration (so)
+ # part 1 so_1
+ so_1 = - as.scalar (colSums (X * E) %*% (b + sb));
+ # part 2 so_2
+ exp_Xbsb = exp (X %*% (b + sb));
+ exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum");
+ so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg))));
+ #
+ so = so_1 + so_2;
+ so = so - o;
+
+ delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g)));
+ if (so < 0) {
+ b = b + sb;
+ o = o + so;
+ # compute new gradient g
+ exp_Xb = exp (X %*% b);
+ exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum");
+ X_exp_Xb = X * exp_Xb;
+ X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+ X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+ D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator
+ g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev));
+ g = g0_1 + g_2;
+ sum_g2 = sum (g ^ 2);
+ }
+}
+
+if (sum_g2 > exit_g2 & i >= maxiter) {
+ print ("Trust region Newton method did not converge!");
+}
+
+
+print ("COMPUTING HESSIAN...");
+
+H0 = matrix (0, rows = D, cols = D);
+H = matrix (0, rows = D, cols = D);
+
+X_exp_Xb_rev_2 = rev(X_exp_Xb);
+X_rev_2 = rev(X);
+
+X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb));
+X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg;
+
+parfor (i in 1:D, check = 0) {
+ Xi = X[,i];
+ Xi_rev = rev(Xi);
+
+ ## ----------Start calculating H0--------------
+ # part 1 H0_1
+ Xi_X = X_rev_2[,i:D] * Xi_rev;
+ Xi_X_rev_agg = cumsum (Xi_X);
+ Xi_X_rev_agg = select %*% Xi_X_rev_agg;
+ H0_1 = Xi_X_rev_agg / e_r_rev_agg;
+
+ # part 2 H0_2
+ Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum");
+ Xi_agg_rev_agg = cumsum (rev(Xi_agg));
+ H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator
+ H0_2 = H0_2_num / (e_r_rev_agg ^ 2);
+
+ H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev);
+ #-----------End calculating H0--------------------
+
+ ## ----------Start calculating H--------------
+ # part 1 H_1
+ Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev;
+ Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb);
+ Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg;
+ H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev;
+
+ # part 2 H_2
+ Xi_exp_Xb = exp_Xb * Xi;
+ Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum");
+
+ Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg));
+ H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator
+ H_2 = H_2_num / (D_r_rev ^ 2);
+ H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev);
+ #-----------End calculating H--------------------
+}
+H = H + t(H) - diag( diag (H));
+H0 = H0 + t(H0) - diag( diag (H0));
+
+
+# compute standard error for betas
+H_inv = inv (H);
+se_b = sqrt (diag (H_inv));
+
+# compute exp(b), Z, Pr[>|Z|]
+exp_b = exp (b);
+Z = b / se_b;
+P = matrix (0, rows = D, cols = 1);
+parfor (i in 1:D) {
+ P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal"));
+}
+
+# compute confidence intervals for b
+z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
+CI_l = b - se_b * z_alpha_2;
+CI_r = b - se_b + z_alpha_2;
+
+######## SOME STATISTICS AND TESTS
+# no. of records
+S_str = "no. of records " + N;
+
+# no.of events
+S_str = append (S_str, "no. of events " + sum (E));
+
+# log-likelihood
+loglik = -o;
+S_str = append (S_str, "loglik " + loglik + " ");
+
+# AIC = -2 * loglik + 2 * D
+AIC = -2 * loglik + 2 * D;
+S_str = append (S_str, "AIC " + AIC + " ");
+
+# Wald test
+wald_t = as.scalar (t(b) %*% H %*% b);
+wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D);
+T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " ";
+
+# Likelihood ratio test
+lratio_t = 2 * o_init - 2 * o;
+lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D);
+T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " ");
+
+
+H0_inv = inv (H0);
+score_t = as.scalar (t (g0) %*% H0_inv %*% g0);
+score_p = 1 - cdf (target = score_t, dist = "chisq", df = D);
+T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " ");
+
+# Rsquare (Cox & Snell)
+Rsquare = 1 - exp (-lratio_t / N);
+Rsquare_max = 1 - exp (-2 * o_init / N);
+S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " ");
+S_str = append (S_str, "max possible Rsquare: " + Rsquare_max);
+
+M = matrix (0, rows = D, cols = 7);
+M[,1] = b;
+M[,2] = exp_b;
+M[,3] = se_b;
+M[,4] = Z;
+M[,5] = P;
+M[,6] = CI_l;
+M[,7] = CI_r;
+
+write (M, fileM, format = fmtO);
+if (fileS != " ") {
+ write (S_str, fileS, format = fmtO);
+} else {
+ print (S_str);
+}
+if (fileT != " ") {
+ write (T_str, fileT, format = fmtO);
+} else {
+ print (T_str);
+}
+# needed for prediction
+write (RT, fileRT, format = fmtO);
+write (H_inv, fileCOV, format = fmtO);
+write (X_orig, fileXO, format = fmtO);
+
+
+####### UDFS FOR TRUST REGION NEWTON METHOD
+
+ensure_trust_bound =
+ function (double x, double a, double b, double c)
+ return (double x_new, boolean is_violated)
+{
+ if (a * x^2 + b * x + c > 0)
+ {
+ is_violated = TRUE;
+ rad = sqrt (b ^ 2 - 4 * a * c);
+ if (b >= 0) {
+ x_new = - (2 * c) / (b + rad);
+ } else {
+ x_new = - (b - rad) / (2 * a);
+ }
+ } else {
+ is_violated = FALSE;
+ x_new = x;
+ }
+}
+
+update_trust_bound =
+ function (double delta,
+ double sb_distance,
+ double so_exact,
+ double so_linear_approx,
+ double so_quadratic_approx)
+ return (double delta)
+{
+ sigma1 = 0.25;
+ sigma2 = 0.5;
+ sigma3 = 4.0;
+
+ if (so_exact <= so_linear_approx) {
+ alpha = sigma3;
+ } else {
+ alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx));
+ }
+
+ rho = so_exact / so_quadratic_approx;
+ if (rho < 0.0001) {
+ delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta);
+ } else { if (rho < 0.25) {
+ delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta));
+ } else { if (rho < 0.75) {
+ delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta));
+ } else {
+ delta = max (delta, min (alpha * sb_distance, sigma3 * delta));
+ }}}
+}