You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/22 17:34:23 UTC
[47/51] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/KM.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/KM.dml b/scripts/algorithms/KM.dml
index ae5d5dd..fbfa917 100644
--- a/scripts/algorithms/KM.dml
+++ b/scripts/algorithms/KM.dml
@@ -1,619 +1,619 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------------------------
-# X String --- Location to read the input matrix X containing the survival data:
-# timestamps, whether event occurred (1) or data is censored (0), and a number of factors (categorical features)
-# for grouping and/or stratifying
-# TE String --- Column indices of X which contain timestamps (first entry) and event information (second entry)
-# GI String --- Column indices of X corresponding to the factors to be used for grouping
-# SI String --- Column indices of X corresponding to the factors to be used for stratifying
-# O String --- Location to write the matrix containing the results of the Kaplan-Meier analysis; see below for the description
-# M String --- Location to write Matrix M containing the following statistic: total number of events, median and its confidence intervals;
-# if survival data for multiple groups and strata are provided each row of M contains the above statistics per group and stratum
-# T String " " If survival data from multiple groups available and ttype=log-rank or wilcoxon,
-# location to write the matrix containing result of the (stratified) test for comparing multiple groups
-# alpha Double 0.05 Parameter to compute 100*(1-alpha)% confidence intervals for the survivor function and its median
-# etype String "greenwood" Parameter to specify the error type according to "greenwood" (the default) or "peto"
-# ctype String "log" Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of
-# the confidence interval unmodified, "log" (the default) corresponds to logistic transformation and
-# "log-log" corresponds to the complementary log-log transformation
-# ttype String "none" If survival data for multiple groups is available specifies which test to perform for comparing
-# survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test
-# fmt String "text" The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
-# each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
-# 1. col: timestamp
-# 2. col: no. at risk
-# 3. col: no. of events
-# 4. col: Kaplan-Meier estimate of survivor function surv
-# 5. col: standard error of surv
-# 6. col: lower 100*(1-alpha)% confidence interval for surv
-# 7. col: upper 100*(1-alpha)% confidence interval for surv
-# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping
-# ,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
-# M[,1:k]: unique combination of values in the k factors used for grouping
-# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
-# M[,k+l+1]: total number of records
-# M[,k+l+2]: total number of events
-# M[,k+l+3]: median of surv
-# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
-# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
-# If the number of groups and strata is equal to 1, M will have 4 columns with
-# M[,1]: total number of events
-# M[,2]: median of surv
-# M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
-# M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
-# 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
-# T_GROUPS_OE[,1] = no. of events
-# T_GROUPS_OE[,2] = observed value (O)
-# T_GROUPS_OE[,3] = expected value (E)
-# T_GROUPS_OE[,4] = (O-E)^2/E
-# T_GROUPS_OE[,5] = (O-E)^2/V
-# T[1,1] = no. of groups
-# T[1,2] = degree of freedom for Chi-squared distributed test statistic
-# T[1,3] = test statistic
-# T[1,4] = P-value
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O
-# M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv
-
-fileX = $X;
-fileTE = $TE;
-fileGI = ifdef ($GI, " ");
-fileSI = ifdef ($SI, " ");
-fileO = $O;
-fileM = $M;
-
-# Default values of some parameters
-fileT = ifdef ($T, " "); # $T=" "
-
-fileG = ifdef ($G, " "); # $G=" "
-fileS = ifdef ($S, " "); # $S=" "
-fmtO = ifdef ($fmt, "text"); # $fmt="text"
-alpha = ifdef ($alpha, 0.05); # $alpha=0.05
-err_type = ifdef ($etype, "greenwood"); # $etype="greenwood"
-conf_type = ifdef ($ctype, "log"); # $ctype="log"
-test_type = ifdef ($ttype, "none"); # $ttype="none"
-
-X = read (fileX);
-TE = read (fileTE);
-if (fileGI != " ") {
- GI = read (fileGI);
-} else {
- GI = matrix (0, rows = 1, cols = 1);
-}
-
-if (fileSI != " "){
- SI = read (fileSI);
-} else {
- SI = matrix (0, rows = 1, cols = 1);
-}
-
-TE = t(TE);
-GI = t(GI);
-SI = t(SI);
-
-# check arguments for validity
-if (err_type != "greenwood" & err_type != "peto") {
- stop (err_type + " is not a valid error type!");
-}
-
-if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") {
- stop (conf_type + " is not a valid confidence type!");
-}
-
-if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
- stop (test_type + " is not a valid test type!");
-}
-
-n_group_cols = ncol (GI);
-n_stratum_cols = ncol (SI);
-
-# check GI and SI for validity
-GI_1_1 = as.scalar (GI[1,1]);
-SI_1_1 = as.scalar (SI[1,1]);
-if (n_group_cols == 1) {
- if (GI_1_1 == 0) { # no factors for grouping
- n_group_cols = 0;
- }
-} else if (GI_1_1 == 0) {
- stop ("Matrix GI contains zero entries!");
-}
-if (n_stratum_cols == 1) {
- if (SI_1_1 == 0) { # no factors for stratifying
- n_stratum_cols = 0;
- }
-} else if (SI_1_1 == 0) {
- stop ("Matrix SI contains zero entries!");
-}
-
-if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
- stop ("X has an incorrect number of columns!");
-}
-
-# reorder cols of X
-if (GI_1_1 == 0 & SI_1_1 == 0) {
- Is = TE;
-} else if (GI_1_1 == 0) {
- Is = append (TE, SI);
-} else if (SI_1_1 == 0) {
- Is = append (TE, GI);
-} else {
- Is = append (TE, append (GI, SI));
-}
-X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);
-
-num_records = nrow (X);
-num_groups = 1;
-num_strata = 1;
-
-### compute group id for each record
-print ("Perform grouping...");
-if (n_group_cols > 0) {
- for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
- X = order (target = X, by = 2 + g);
- }
- XG = X[,3:(3 + n_group_cols - 1)];
- Idx = matrix (1, rows = num_records, cols = 1);
- Idx[2:num_records,] = rowMaxs (ppred (X[1:(num_records - 1),3:(2 + n_group_cols)], X[2:num_records,3:(2 + n_group_cols)], "!="));
- num_groups = sum (Idx);
-
- XG = replace (target = XG, pattern = 0, replacement = "Infinity");
- XG = XG * Idx;
- XG = replace (target = XG, pattern = "NaN", replacement = 0);
- G_cols = removeEmpty (target = XG, margin = "rows");
- G_cols = replace (target = G_cols, pattern = "Infinity", replacement = 0);
-
- A = removeEmpty (target = diag (Idx), margin = "cols");
- if (ncol (A) > 1) {
- A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
- B = cumsum (A);
- Gi = B %*% seq(1, ncol(B)); # group ids
- } else { # there is only one group
- Gi = matrix (1, rows = num_records, cols = 1);
- }
- if (n_stratum_cols > 0) {
- X = append (append (X[,1:2],Gi), X[,(3 + g):ncol (X)]);
- } else { # no strata
- X = append (X[,1:2],Gi);
- }
-}
-
-### compute stratum id for each record
-print ("Perform stratifying...");
-if (n_stratum_cols > 0) {
- s_offset = 2;
- if (n_group_cols > 0) {
- s_offset = 3;
- }
- for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
- X = order (target = X, by = s_offset + s);
- }
- XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
- Idx = matrix (1, rows = num_records, cols = 1);
- Idx[2:num_records,] = rowMaxs (ppred (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)], X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)], "!="));
- num_strata = sum (Idx);
-
- XS = replace (target = XS, pattern = 0, replacement = "Infinity");
- XS = XS * Idx;
- XS = replace (target = XS, pattern = "NaN", replacement = 0);
- S_cols = removeEmpty (target = XS, margin = "rows");
- S_cols = replace (target = S_cols, pattern = "Infinity", replacement = 0);
-
- SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
- A = removeEmpty (target = diag (Idx), margin = "cols");
- if (ncol (A) > 1) {
- A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
- B = cumsum (A);
- Si = B %*% seq(1, ncol(B)); # stratum ids
- } else { # there is only one stratum
- Si = matrix (1, rows = num_records, cols = 1);
- }
- X = append (X[,1:3],Si);
-}
-
-if (n_group_cols == 0 & n_stratum_cols == 0) {
- X = append (X, matrix (1, rows = num_records, cols = 2));
- SB = matrix (1, rows = 1, cols = 1);
-} else if (n_group_cols == 0) {
- X = append (X[,1:2], append (matrix (1, rows = num_records, cols = 1), X[,3]));
-} else if (n_stratum_cols == 0) {
- X = append (X, matrix (1, rows = num_records, cols = 1));
- SB = matrix (1, rows = 1, cols = 1);
-}
-
-######## BEGIN KAPLAN-MEIER ANALYSIS
-print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");
-
-KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
-KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
-GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
-a = 1/0;
-M = matrix (a, rows = num_groups * num_strata, cols = 5);
-M_cols = seq (1, num_groups * num_strata);
-z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
-
-if (num_groups > 1 & test_type != "none") {
- str = "";
- TEST = matrix (0, rows = num_groups, cols = 5);
- TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
- U = matrix (0, rows = num_groups, cols = num_strata);
- U_OE = matrix (0, rows = num_groups, cols = num_strata);
- OBS = matrix (0, rows = num_groups, cols = num_strata);
- EXP = matrix (0, rows = num_groups, cols = num_strata);
- V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
- n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
-} else if (num_groups == 1 & test_type != "none") {
- stop ("Data contains only one group or no groups, at least two groups are required for test!");
-}
-
-parfor (s in 1:num_strata, check = 0) {
-
- start_ind = as.scalar (SB[s,]);
- end_ind = num_records;
- if (s != num_strata) {
- end_ind = as.scalar (SB[s + 1,]) - 1;
- }
-
- ######## RECODING TIMESTAMPS PRESERVING THE ORDER
-
- X_cur = X[start_ind:end_ind,];
- range = end_ind - start_ind + 1;
- X_cur = order (target = X_cur, by = 1);
- Idx1 = matrix (1, rows = range, cols = 1);
-
- num_timestamps = 1;
- if (range == 1) {
- RT = matrix (1, rows = 1, cols = 1);
- } else {
- Idx1[2:range,1] = ppred (X_cur[1:(range - 1),1], X_cur[2:range,1], "!=");
- num_timestamps = sum (Idx1);
- A1 = removeEmpty (target = diag (Idx1), margin = "cols");
- if (ncol (A1) > 1) {
- A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
- B1 = cumsum (A1);
- RT = B1 %*% seq(1, ncol(B1));
- } else { # there is only one group
- RT = matrix (1, rows = range, cols = 1);
- }
- }
-
- T = X_cur[,1];
- E = X_cur[,2];
- G = X_cur[,3];
- S = X_cur[,4];
-
- n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
- n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. both censored and uncensored of events per stratum
- Idx1 = cumsum (n_event_all_stratum);
- time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
- time_stratum_has_zero = sum (ppred (time_stratum, 0, "==")) > 0;
- if (time_stratum_has_zero) {
- time_stratum = replace (target = time_stratum, pattern = 0, replacement = "Infinity");
- }
- n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
- n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
- if (n_time_all1 > 1) {
- n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
- }
- n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
-
- if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
- n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
- }
-
- parfor (g in 1:num_groups, check = 0) {
-
- group_ind = ppred (G, g, "==");
- KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
- M_offset = (s - 1) * num_groups + g;
- if (sum (group_ind) != 0) { # group g is present in the stratum s
-
- GSI_offset = (s - 1) * num_groups + g;
- GSI[GSI_offset,1] = g;
- GSI[GSI_offset,2] = s;
- E_cur = E * group_ind;
-
- ######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP
-
- n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
- n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
- Idx1 = cumsum (n_event_all);
- event_occurred = ppred (n_event, 0, ">");
- if (time_stratum_has_zero) {
- time = replace (target = time_stratum * event_occurred, pattern = "NaN", replacement = 0);
- time = removeEmpty (target = time, margin = "rows");
- time = replace (target = time, pattern = "Infinity", replacement = 0);
- } else {
- time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
- }
- n_time_all2 = nrow (n_event); # no. of distinct timestamps both censored and uncensored per stratum per group
- n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
- if (n_time_all2 > 1) {
- n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
- }
-
- n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
-
- if (num_groups > 1 & test_type != "none") {
- n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
- n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
- }
-
- # Extract only rows corresponding to events, i.e., for which n_event is nonzero
- Idx1 = ppred (n_event, 0, "!=");
- KM_1 = matrix (0, rows = n_time_all2, cols = 2);
- KM_1[,1] = n_risk;
- KM_1[,2] = n_event;
- KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
- n_risk = KM_1[,1];
- n_event = KM_1[,2];
- n_time = nrow (time);
-
- ######## ESTIMATE SERVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
- surv = cumprod ((n_risk - n_event) / n_risk);
- tmp = n_event / (n_risk * (n_risk - n_event));
- se_surv = sqrt (cumsum (tmp)) * surv;
- if (err_type == "peto") {
- se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
- }
-
- if (conf_type == "plain") {
- # True survivor function is in [surv +- z_alpha_2 * se_surv],
- # values less than 0 are replaced by 0, values larger than 1are replaced by 1!
- CI_l = max (surv - (z_alpha_2 * se_surv), 0);
- CI_r = min (surv + (z_alpha_2 * se_surv), 1);
- } else if (conf_type == "log") {
- # True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
- CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
- CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
- } else { # conf_type == "log-log"
- # True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
- CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
- CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
- }
- #
- if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
- CI_l[n_time,] = 0/0;
- CI_r[n_time,] = 0/0;
- }
-
- n_event_sum = sum (n_event);
- n_event_sum_all = sum(n_event_all);
- if (n_event_sum > 0) {
- # KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
- KM[1:n_time,KM_offset + 1] = time;
- KM[1:n_time,KM_offset + 2] = n_risk;
- KM[1:n_time,KM_offset + 3] = n_event;
- KM[1:n_time,KM_offset + 4] = surv;
- KM[1:n_time,KM_offset + 5] = se_surv;
- KM[1:n_time,KM_offset + 6] = CI_l;
- KM[1:n_time,KM_offset + 7] = CI_r;
- }
-
- ######## ESTIMATE MEDIAN OF SERVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
-
- p_5 = ppred (surv, 0.5, "<=");
- pn_5 = sum (p_5);
- #M_offset = (s - 1) * num_groups + g;
- # if the estimated survivor function is larger than 0.5 for all timestamps median does not exist!
- p_5_exists = (pn_5 != 0);
- M[M_offset,2] = n_event_sum;
- M[M_offset,1] = n_event_sum_all;
- if (p_5_exists) {
- if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
- if (pn_5 > 1) {
- t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
- } else {
- t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
- }
- } else {
- t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
- }
-
- l_ind = ppred (CI_l, 0.5, "<=");
- r_ind = ppred (CI_r, 0.5, "<=");
- l_ind_sum = sum (l_ind);
- r_ind_sum = sum (r_ind);
- l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
- r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
- if (l_min_ind == n_time) {
- if (l_ind_sum > 0) {
- if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
- M[M_offset,4] = time[n_time - l_ind_sum,1];
- } else {
- M[M_offset,4] = time[1,1];
- }
- }
- } else {
- M[M_offset,4] = time[l_min_ind + 1,1];
- }
- #
- if (r_min_ind == n_time) {
- if (r_ind_sum > 0) {
- if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
- M[M_offset,5] = time[n_time - r_ind_sum,1];
- } else {
- M[M_offset,5] = time[1,1];
- }
- }
- } else {
- M[M_offset,5] = time[r_min_ind + 1,1];
- }
- M[M_offset,3] = t_5;
- if (test_type != "none"){
- n_event_all_global[g,s] = n_event_sum_all;
- }
- }
- } else {
- print ("group " + g + " is not present in the stratum " + s);
- KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
- M_cols[M_offset,1] = 0;
- }
- }
-
-
- ######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
-
- if (num_groups > 1 & test_type != "none") {
-
- V = matrix (0, rows = num_groups-1, cols = num_groups-1);
- parfor (g in 0:(num_groups-1), check = 0) {
-
- n_risk = n_risk_n_event_stratum[,g * 2 + 1];
- n_event = n_risk_n_event_stratum[,g * 2 + 2];
-
- if (test_type == "log-rank") {
- O = n_event;
- E = n_risk * n_event_stratum / n_risk_stratum;
- } else { ### test_type == "wilcoxon"
- O = n_risk_stratum * n_event / range;
- E = n_risk * n_event_stratum / range;
- }
- U[(g + 1),s] = sum (O - E);
- U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
- OBS[g + 1, s] = sum(O);
- EXP[g + 1, s] = sum(E);
- }
-
- # parfor (i1 in 0:(num_groups - 2), check = 0) {
- for (i1 in 0:(num_groups - 2), check = 0) {
-
- n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
- n_event = n_risk_n_event_stratum[,2 + i1 * 2];
- for (i2 in 0:(num_groups - 2)) {
-
- n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
- I_i1i2 = 0;
- if (i1 == i2) {
- I_i1i2 = 1;
- }
- if (test_type == "log-rank") {
- V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
- V1 = replace (target = V1, pattern = "NaN", replacement = 0);
- V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
- V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
- } else { ### test_type == "wilcoxon"
- V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
- V1 = replace (target = V1, pattern = "NaN", replacement = 0);
- V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
- V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
- }
- }
- }
- V_start_ind = (s - 1) * (num_groups - 1) + 1;
- V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
- }
-}
-
-if (num_groups > 1 & test_type != "none") {
- V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
- for (s in 1:num_strata) {
- V_start_ind = (s - 1) * (num_groups - 1) + 1;
- V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
- V_sum = V_sum + V_sum_total_part;
- }
-
- U_sum = rowSums (U);
-
- test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
- p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
- if (test_type != "none") {
- U_OE_sum = rowSums(U_OE);
- V_OE =rowSums((U*U) /sum(V_sum));
- TEST_GROUPS_OE[1,1] = num_groups;
- TEST_GROUPS_OE[1,2] = num_groups - 1;
- TEST_GROUPS_OE[1,3] = test_st;
- TEST_GROUPS_OE[1,4] = p_val;
- TEST[,1] = rowSums(n_event_all_global);
- TEST[,2] = rowSums(OBS);
- TEST[,3] = rowSums(EXP);
- TEST[,4] = rowSums(U_OE_sum);
- TEST[,5] = rowSums(V_OE);
- str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
- }
-}
-
-GSI = removeEmpty (target = GSI, margin = "rows");
-if (n_group_cols > 0) {
- # making a copy of unique groups before adding new rows depending on strata
- G_cols_original = G_cols;
-
- GSI_1 = GSI[,1];
- tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
- G_cols = tab_G %*% G_cols;
-}
-
-if (n_stratum_cols > 0) {
- GSI_2 = GSI[,2];
- tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
- S_cols = tab_S %*% S_cols;
-}
-
-# pull out non-empty rows from M
-M_cols = removeEmpty (target = M_cols, margin = "rows");
-tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
-M = tab_M %*% M;
-M = replace (target = M, pattern = "Infinity", replacement = "NaN");
-
-# pull out non-empty rows from TEST
-if (n_group_cols > 0 & n_stratum_cols > 0) {
- M = append (append (G_cols, S_cols), M);
- if (test_type != "none") {
- TEST = append (G_cols_original, TEST);
- }
-} else if (n_group_cols > 0) {
- M = append (G_cols, M);
- if (test_type != "none") {
- TEST = append (G_cols_original, TEST);
- }
-} else if (n_stratum_cols > 0) {
- M = append (S_cols, M);
-}
-
-# pull out non-empty columns from KM
-KM = t (append (t (KM), KM_cols_select) * KM_cols_select);
-KM = removeEmpty (target = KM, margin = "cols");
-KM = removeEmpty (target = KM, margin = "rows");
-KM = KM[1:(nrow (KM) - 1),];
-
-# write output matrices
-write (M, fileM, format=fmtO);
-write (KM, fileO, format=fmtO);
-
-if (test_type != "none") {
- if (num_groups > 1 & fileT != " ") {
- write (TEST, fileT, format=fmtO);
- write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
- } else {
- print (str);
- }
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT ANALIZES SURVIVAL DATA USING KAPLAN-MEIER ESTIMATES
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------------------------
+# X String --- Location to read the input matrix X containing the survival data:
+# timestamps, whether event occurred (1) or data is censored (0), and a number of factors (categorical features)
+# for grouping and/or stratifying
+# TE String --- Column indices of X which contain timestamps (first entry) and event information (second entry)
+# GI String --- Column indices of X corresponding to the factors to be used for grouping
+# SI String --- Column indices of X corresponding to the factors to be used for stratifying
+# O String --- Location to write the matrix containing the results of the Kaplan-Meier analysis; see below for the description
+# M String --- Location to write Matrix M containing the following statistic: total number of events, median and its confidence intervals;
+# if survival data for multiple groups and strata are provided each row of M contains the above statistics per group and stratum
+# T String " " If survival data from multiple groups available and ttype=log-rank or wilcoxon,
+# location to write the matrix containing result of the (stratified) test for comparing multiple groups
+# alpha Double 0.05 Parameter to compute 100*(1-alpha)% confidence intervals for the survivor function and its median
+# etype String "greenwood" Parameter to specify the error type according to "greenwood" (the default) or "peto"
+# ctype String "log" Parameter to modify the confidence interval; "plain" keeps the lower and upper bound of
+# the confidence interval unmodified, "log" (the default) corresponds to logistic transformation and
+# "log-log" corresponds to the complementary log-log transformation
+# ttype String "none" If survival data for multiple groups is available specifies which test to perform for comparing
+# survival data across multiple groups: "none" (the default) "log-rank" or "wilcoxon" test
+# fmt String "text" The output format of results of the Kaplan-Meier analysis, such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# 1- Matrix KM whose dimension depends on the number of groups (denoted by g) and strata (denoted by s) in the data:
+# each collection of 7 consecutive columns in KM corresponds to a unique combination of groups and strata in the data with the following schema
+# 1. col: timestamp
+# 2. col: no. at risk
+# 3. col: no. of events
+# 4. col: Kaplan-Meier estimate of survivor function surv
+# 5. col: standard error of surv
+# 6. col: lower 100*(1-alpha)% confidence interval for surv
+# 7. col: upper 100*(1-alpha)% confidence interval for surv
+# 2- Matrix M whose dimension depends on the number of groups (g) and strata (s) in the data (k denotes the number of factors used for grouping
+# ,i.e., ncol(GI) and l denotes the number of factors used for stratifying, i.e., ncol(SI))
+# M[,1:k]: unique combination of values in the k factors used for grouping
+# M[,(k+1):(k+l)]: unique combination of values in the l factors used for stratifying
+# M[,k+l+1]: total number of records
+# M[,k+l+2]: total number of events
+# M[,k+l+3]: median of surv
+# M[,k+l+4]: lower 100*(1-alpha)% confidence interval of the median of surv
+# M[,k+l+5]: upper 100*(1-alpha)% confidence interval of the median of surv
+# If the number of groups and strata is equal to 1, M will have 4 columns with
+# M[,1]: total number of events
+# M[,2]: median of surv
+# M[,3]: lower 100*(1-alpha)% confidence interval of the median of surv
+# M[,4]: upper 100*(1-alpha)% confidence interval of the median of surv
+# 3- If survival data from multiple groups available and ttype=log-rank or wilcoxon, a 1 x 4 matrix T and an g x 5 matrix T_GROUPS_OE with
+# T_GROUPS_OE[,1] = no. of events
+# T_GROUPS_OE[,2] = observed value (O)
+# T_GROUPS_OE[,3] = expected value (E)
+# T_GROUPS_OE[,4] = (O-E)^2/E
+# T_GROUPS_OE[,5] = (O-E)^2/V
+# T[1,1] = no. of groups
+# T[1,2] = degree of freedom for Chi-squared distributed test statistic
+# T[1,3] = test statistic
+# T[1,4] = P-value
+# -------------------------------------------------------------------------------------------
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f KM.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE GI=INPUT_DIR/GI SI=INPUT_DIR/SI O=OUTPUT_DIR/O
+# M=OUTPUT_DIR/M T=OUTPUT_DIR/T alpha=0.05 etype=greenwood ctype=log fmt=csv
+
+fileX = $X;
+fileTE = $TE;
+fileGI = ifdef ($GI, " ");
+fileSI = ifdef ($SI, " ");
+fileO = $O;
+fileM = $M;
+
+# Default values of some parameters
+fileT = ifdef ($T, " "); # $T=" "
+
+fileG = ifdef ($G, " "); # $G=" "
+fileS = ifdef ($S, " "); # $S=" "
+fmtO = ifdef ($fmt, "text"); # $fmt="text"
+alpha = ifdef ($alpha, 0.05); # $alpha=0.05
+err_type = ifdef ($etype, "greenwood"); # $etype="greenwood"
+conf_type = ifdef ($ctype, "log"); # $ctype="log"
+test_type = ifdef ($ttype, "none"); # $ttype="none"
+
+X = read (fileX);
+TE = read (fileTE);
+if (fileGI != " ") {
+ GI = read (fileGI);
+} else {
+ GI = matrix (0, rows = 1, cols = 1);
+}
+
+if (fileSI != " "){
+ SI = read (fileSI);
+} else {
+ SI = matrix (0, rows = 1, cols = 1);
+}
+
+TE = t(TE);
+GI = t(GI);
+SI = t(SI);
+
+# check arguments for validity
+if (err_type != "greenwood" & err_type != "peto") {
+ stop (err_type + " is not a valid error type!");
+}
+
+if (conf_type != "plain" & conf_type != "log" & conf_type != "log-log") {
+ stop (conf_type + " is not a valid confidence type!");
+}
+
+if (test_type != "log-rank" & test_type != "wilcoxon" & test_type != "none") {
+ stop (test_type + " is not a valid test type!");
+}
+
+n_group_cols = ncol (GI);
+n_stratum_cols = ncol (SI);
+
+# check GI and SI for validity: a single zero entry means "no factors",
+# but a zero among several entries is an invalid column index.
+GI_1_1 = as.scalar (GI[1,1]);
+SI_1_1 = as.scalar (SI[1,1]);
+if (n_group_cols == 1) {
+ if (GI_1_1 == 0) { # no factors for grouping
+ n_group_cols = 0;
+ }
+} else if (GI_1_1 == 0) {
+ stop ("Matrix GI contains zero entries!");
+}
+if (n_stratum_cols == 1) {
+ if (SI_1_1 == 0) { # no factors for stratifying
+ n_stratum_cols = 0;
+ }
+} else if (SI_1_1 == 0) {
+ stop ("Matrix SI contains zero entries!");
+}
+
+# X must hold at least: timestamp col, event col, group cols, stratum cols
+if (2 + n_group_cols + n_stratum_cols > ncol (X)) {
+ stop ("X has an incorrect number of columns!");
+}
+
+# reorder cols of X: Is lists the source column indices in the desired
+# order [timestamp, event, group cols..., stratum cols...]
+if (GI_1_1 == 0 & SI_1_1 == 0) {
+ Is = TE;
+} else if (GI_1_1 == 0) {
+ Is = append (TE, SI);
+} else if (SI_1_1 == 0) {
+ Is = append (TE, GI);
+} else {
+ Is = append (TE, append (GI, SI));
+}
+# table () builds a selection/permutation matrix; the product keeps only
+# the listed columns of X, in the order given by Is
+X = X %*% table (Is, seq (1, 2 + n_group_cols + n_stratum_cols), ncol (X), 2 + n_group_cols + n_stratum_cols);
+
+num_records = nrow (X);
+num_groups = 1;
+num_strata = 1;
+
+### compute group id for each record
+print ("Perform grouping...");
+if (n_group_cols > 0) {
+ for (g in 1:n_group_cols) { # sort columns corresponding to groups sequentially
+ X = order (target = X, by = 2 + g);
+ }
+ XG = X[,3:(3 + n_group_cols - 1)];
+ # Idx marks the first record of each distinct group-value combination
+ # (row differs from its predecessor in at least one group column)
+ Idx = matrix (1, rows = num_records, cols = 1);
+ Idx[2:num_records,] = rowMaxs (ppred (X[1:(num_records - 1),3:(2 + n_group_cols)], X[2:num_records,3:(2 + n_group_cols)], "!="));
+ num_groups = sum (Idx);
+
+ # 0 -> Infinity -> back, so genuine zero factor values survive the
+ # removeEmpty () that extracts one representative row per group
+ XG = replace (target = XG, pattern = 0, replacement = "Infinity");
+ XG = XG * Idx;
+ XG = replace (target = XG, pattern = "NaN", replacement = 0);
+ G_cols = removeEmpty (target = XG, margin = "rows");
+ G_cols = replace (target = G_cols, pattern = "Infinity", replacement = 0);
+
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ Gi = B %*% seq(1, ncol(B)); # group ids
+ } else { # there is only one group
+ Gi = matrix (1, rows = num_records, cols = 1);
+ }
+ if (n_stratum_cols > 0) {
+ # NOTE(review): relies on loop variable g retaining its final value
+ # (n_group_cols) after the for loop; (3 + n_group_cols) would be clearer
+ X = append (append (X[,1:2],Gi), X[,(3 + g):ncol (X)]);
+ } else { # no strata
+ X = append (X[,1:2],Gi);
+ }
+}
+
+### compute stratum id for each record (mirrors the grouping step above)
+print ("Perform stratifying...");
+if (n_stratum_cols > 0) {
+ # stratum columns start after [time, event] and, if present, the group id
+ s_offset = 2;
+ if (n_group_cols > 0) {
+ s_offset = 3;
+ }
+ for (s in 1:n_stratum_cols) { # sort columns corresponding to strata sequentially
+ X = order (target = X, by = s_offset + s);
+ }
+ XS = X[,(s_offset + 1):(s_offset + n_stratum_cols)];
+ Idx = matrix (1, rows = num_records, cols = 1);
+ Idx[2:num_records,] = rowMaxs (ppred (X[1:(num_records - 1),(s_offset + 1):(s_offset + n_stratum_cols)], X[2:num_records,(s_offset + 1):(s_offset + n_stratum_cols)], "!="));
+ num_strata = sum (Idx);
+
+ # preserve genuine zero factor values through removeEmpty (), as above
+ XS = replace (target = XS, pattern = 0, replacement = "Infinity");
+ XS = XS * Idx;
+ XS = replace (target = XS, pattern = "NaN", replacement = 0);
+ S_cols = removeEmpty (target = XS, margin = "rows");
+ S_cols = replace (target = S_cols, pattern = "Infinity", replacement = 0);
+
+ SB = removeEmpty (target = seq (1,num_records), margin = "rows", select = Idx); # indices of stratum boundaries
+ A = removeEmpty (target = diag (Idx), margin = "cols");
+ if (ncol (A) > 1) {
+ A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)];
+ B = cumsum (A);
+ Si = B %*% seq(1, ncol(B)); # stratum ids
+ } else { # there is only one stratum
+ Si = matrix (1, rows = num_records, cols = 1);
+ }
+ X = append (X[,1:3],Si);
+}
+
+# After this point X always has 4 columns: [time, event, group id, stratum id];
+# missing group/stratum factors are filled with constant 1.
+if (n_group_cols == 0 & n_stratum_cols == 0) {
+ X = append (X, matrix (1, rows = num_records, cols = 2));
+ SB = matrix (1, rows = 1, cols = 1);
+} else if (n_group_cols == 0) {
+ X = append (X[,1:2], append (matrix (1, rows = num_records, cols = 1), X[,3]));
+} else if (n_stratum_cols == 0) {
+ X = append (X, matrix (1, rows = num_records, cols = 1));
+ SB = matrix (1, rows = 1, cols = 1);
+}
+
+######## BEGIN KAPLAN-MEIER ANALYSIS
+print ("BEGIN KAPLAN-MEIER SURVIVAL FIT SCRIPT");
+
+# KM holds 7 output columns per (group, stratum) cell:
+# [time, n.risk, n.event, surv, se(surv), CI lower, CI upper]
+KM = matrix (0, rows = num_records, cols = num_groups * num_strata * 7);
+KM_cols_select = matrix (1, rows = num_groups * num_strata * 7, cols = 1);
+GSI = matrix (0, rows = num_groups * num_strata, cols = 2);
+# a = 1/0 yields Infinity: M is initialized to Inf so that entries never
+# written (e.g. nonexistent medians) can be turned into NaN at output time
+a = 1/0;
+M = matrix (a, rows = num_groups * num_strata, cols = 5);
+M_cols = seq (1, num_groups * num_strata);
+# normal quantile for the two-sided 100(1-alpha)% confidence interval
+z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal");
+
+if (num_groups > 1 & test_type != "none") {
+ str = "";
+ TEST = matrix (0, rows = num_groups, cols = 5);
+ TEST_GROUPS_OE = matrix (0, rows = 1, cols = 4);
+ U = matrix (0, rows = num_groups, cols = num_strata);
+ U_OE = matrix (0, rows = num_groups, cols = num_strata);
+ OBS = matrix (0, rows = num_groups, cols = num_strata);
+ EXP = matrix (0, rows = num_groups, cols = num_strata);
+ V_sum_total = matrix (0, rows = num_groups-1, cols = (num_groups-1) * num_strata);
+ n_event_all_global = matrix(1, rows=num_groups, cols=num_strata);
+} else if (num_groups == 1 & test_type != "none") {
+ stop ("Data contains only one group or no groups, at least two groups are required for test!");
+}
+
+# Strata are independent, hence processed in parallel (check = 0 disables
+# the parfor dependency analysis; writes are into disjoint slices).
+parfor (s in 1:num_strata, check = 0) {
+
+ # records of stratum s occupy rows start_ind..end_ind after the sort
+ start_ind = as.scalar (SB[s,]);
+ end_ind = num_records;
+ if (s != num_strata) {
+ end_ind = as.scalar (SB[s + 1,]) - 1;
+ }
+
+ ######## RECODING TIMESTAMPS PRESERVING THE ORDER
+
+ X_cur = X[start_ind:end_ind,];
+ range = end_ind - start_ind + 1;
+ X_cur = order (target = X_cur, by = 1);
+ Idx1 = matrix (1, rows = range, cols = 1);
+
+ # RT maps each record to the rank (1..num_timestamps) of its distinct timestamp
+ num_timestamps = 1;
+ if (range == 1) {
+ RT = matrix (1, rows = 1, cols = 1);
+ } else {
+ Idx1[2:range,1] = ppred (X_cur[1:(range - 1),1], X_cur[2:range,1], "!=");
+ num_timestamps = sum (Idx1);
+ A1 = removeEmpty (target = diag (Idx1), margin = "cols");
+ if (ncol (A1) > 1) {
+ A1[,1:(ncol (A1) - 1)] = A1[,1:(ncol (A1) - 1)] - A1[,2:ncol (A1)];
+ B1 = cumsum (A1);
+ RT = B1 %*% seq(1, ncol(B1));
+ } else { # there is only one group
+ RT = matrix (1, rows = range, cols = 1);
+ }
+ }
+
+ T = X_cur[,1];
+ E = X_cur[,2];
+ G = X_cur[,3];
+ S = X_cur[,4];
+
+ n_event_stratum = aggregate (target = E, groups = RT, fn = "sum"); # no. of uncensored events per stratum
+ n_event_all_stratum = aggregate (target = E, groups = RT, fn = "count"); # no. both censored and uncensored of events per stratum
+ Idx1 = cumsum (n_event_all_stratum);
+ time_stratum = table (seq (1, nrow (Idx1), 1), Idx1) %*% T; # distinct timestamps both censored and uncensored per stratum
+ # zero timestamps would vanish in removeEmpty () later; temporarily map them to Inf
+ time_stratum_has_zero = sum (ppred (time_stratum, 0, "==")) > 0;
+ if (time_stratum_has_zero) {
+ time_stratum = replace (target = time_stratum, pattern = 0, replacement = "Infinity");
+ }
+ n_time_all1 = nrow (n_event_stratum); # no. of distinct timestamps both censored and uncensored per stratum
+ n_event_all_stratum_agg = matrix (0, rows = n_time_all1, cols = 1);
+ if (n_time_all1 > 1) {
+ n_event_all_stratum_agg[2:n_time_all1,] = Idx1[1:(n_time_all1 - 1),];
+ }
+ n_risk_stratum = range - n_event_all_stratum_agg; # no. at risk per stratum
+
+ if (num_groups > 1 & test_type != "none") { # needed for log-rank or wilcoxon test
+ n_risk_n_event_stratum = matrix (0, rows = n_time_all1, cols = num_groups * 2);
+ }
+
+ # Per-group Kaplan-Meier fit within stratum s; groups write disjoint slices
+ parfor (g in 1:num_groups, check = 0) {
+
+ group_ind = ppred (G, g, "==");
+ KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
+ M_offset = (s - 1) * num_groups + g;
+ if (sum (group_ind) != 0) { # group g is present in the stratum s
+
+ GSI_offset = (s - 1) * num_groups + g;
+ GSI[GSI_offset,1] = g;
+ GSI[GSI_offset,2] = s;
+ E_cur = E * group_ind;
+
+ ######## COMPUTE NO. AT RISK AND NO.OF EVENTS FOR EACH TIMESTAMP
+
+ n_event = aggregate (target = E_cur, groups = RT, fn = "sum"); # no. of uncensored events per stratum per group
+ n_event_all = aggregate (target = group_ind, groups = RT, fn = "sum"); # no. of both censored and uncensored events per stratum per group
+ Idx1 = cumsum (n_event_all);
+ event_occurred = ppred (n_event, 0, ">");
+ # keep only timestamps at which this group had an uncensored event;
+ # undo the 0 <-> Infinity encoding from the stratum preprocessing
+ if (time_stratum_has_zero) {
+ time = replace (target = time_stratum * event_occurred, pattern = "NaN", replacement = 0);
+ time = removeEmpty (target = time, margin = "rows");
+ time = replace (target = time, pattern = "Infinity", replacement = 0);
+ } else {
+ time = removeEmpty (target = time_stratum * event_occurred, margin = "rows");
+ }
+ n_time_all2 = nrow (n_event); # no. of distinct timestamps both censored and uncensored per stratum per group
+ n_event_all_agg = matrix (0, rows = n_time_all2, cols = 1);
+ if (n_time_all2 > 1) {
+ n_event_all_agg[2:n_time_all2,] = Idx1[1:(n_time_all2 - 1),];
+ }
+
+ n_risk = sum (group_ind) - n_event_all_agg; # no. at risk per stratum per group
+
+ if (num_groups > 1 & test_type != "none") {
+ n_risk_n_event_stratum[,(g - 1) * 2 + 1] = n_risk;
+ n_risk_n_event_stratum[,(g - 1) * 2 + 2] = n_event;
+ }
+
+ # Extract only rows corresponding to events, i.e., for which n_event is nonzero
+ Idx1 = ppred (n_event, 0, "!=");
+ KM_1 = matrix (0, rows = n_time_all2, cols = 2);
+ KM_1[,1] = n_risk;
+ KM_1[,2] = n_event;
+ KM_1 = removeEmpty (target = KM_1, margin = "rows", select = Idx1);
+ n_risk = KM_1[,1];
+ n_event = KM_1[,2];
+ n_time = nrow (time);
+
+ ######## ESTIMATE SURVIVOR FUNCTION SURV, ITS STANDARD ERROR SE_SURV, AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
+ # product-limit estimator; se via Greenwood's formula (default) or Peto's
+ surv = cumprod ((n_risk - n_event) / n_risk);
+ tmp = n_event / (n_risk * (n_risk - n_event));
+ se_surv = sqrt (cumsum (tmp)) * surv;
+ if (err_type == "peto") {
+ se_surv = (surv * sqrt(1 - surv) / sqrt(n_risk));
+ }
+
+ if (conf_type == "plain") {
+ # True survivor function is in [surv +- z_alpha_2 * se_surv],
+ # values less than 0 are replaced by 0, values larger than 1 are replaced by 1!
+ CI_l = max (surv - (z_alpha_2 * se_surv), 0);
+ CI_r = min (surv + (z_alpha_2 * se_surv), 1);
+ } else if (conf_type == "log") {
+ # True survivor function is in [surv * exp(+- z_alpha_2 * se_surv / surv)]
+ CI_l = max (surv * exp (- z_alpha_2 * se_surv / surv), 0);
+ CI_r = min (surv * exp ( z_alpha_2 * se_surv / surv), 1);
+ } else { # conf_type == "log-log"
+ # True survivor function is in [surv ^ exp(+- z_alpha_2 * se(log(-log(surv))))]
+ CI_l = max (surv ^ exp (- z_alpha_2 * se_surv / log(surv)), 0);
+ CI_r = min (surv ^ exp ( z_alpha_2 * se_surv / log(surv)), 1);
+ }
+ # CI is undefined (NaN via 0/0) once everyone at risk has had the event
+ if (as.scalar (n_risk[n_time,]) == as.scalar (n_event[n_time,])) {
+ CI_l[n_time,] = 0/0;
+ CI_r[n_time,] = 0/0;
+ }
+
+ n_event_sum = sum (n_event);
+ n_event_sum_all = sum(n_event_all);
+ if (n_event_sum > 0) {
+ # KM_offset = (s - 1) * num_groups * 7 + (g - 1) * 7;
+ KM[1:n_time,KM_offset + 1] = time;
+ KM[1:n_time,KM_offset + 2] = n_risk;
+ KM[1:n_time,KM_offset + 3] = n_event;
+ KM[1:n_time,KM_offset + 4] = surv;
+ KM[1:n_time,KM_offset + 5] = se_surv;
+ KM[1:n_time,KM_offset + 6] = CI_l;
+ KM[1:n_time,KM_offset + 7] = CI_r;
+ }
+
+ ######## ESTIMATE MEDIAN OF SURVIVAL TIMES AND ITS 100(1-ALPHA)% CONFIDENCE INTERVAL
+
+ p_5 = ppred (surv, 0.5, "<=");
+ pn_5 = sum (p_5);
+ #M_offset = (s - 1) * num_groups + g;
+ # if the estimated survivor function is larger than 0.5 for all timestamps median does not exist!
+ p_5_exists = (pn_5 != 0);
+ M[M_offset,2] = n_event_sum;
+ M[M_offset,1] = n_event_sum_all;
+ if (p_5_exists) {
+ if ( as.scalar (surv[n_time - pn_5 + 1,1]) == 0.5 ) { # if the estimated survivor function is exactly equal to 0.5
+ if (pn_5 > 1) {
+ t_5 = as.scalar ((time[n_time - pn_5 + 1,1] + time[n_time - pn_5 + 2,1])/2);
+ } else {
+ t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
+ }
+ } else {
+ t_5 = as.scalar (time[n_time - pn_5 + 1,1]);
+ }
+
+ # CI of the median: first timestamps at which the CI bounds drop below 0.5
+ l_ind = ppred (CI_l, 0.5, "<=");
+ r_ind = ppred (CI_r, 0.5, "<=");
+ l_ind_sum = sum (l_ind);
+ r_ind_sum = sum (r_ind);
+ l_min_ind = as.scalar (rowIndexMin (t(l_ind)));
+ r_min_ind = as.scalar (rowIndexMin (t(r_ind)));
+ if (l_min_ind == n_time) {
+ if (l_ind_sum > 0) {
+ if (as.scalar (l_ind[n_time,1]) == 0) { # NA at last position
+ M[M_offset,4] = time[n_time - l_ind_sum,1];
+ } else {
+ M[M_offset,4] = time[1,1];
+ }
+ }
+ } else {
+ M[M_offset,4] = time[l_min_ind + 1,1];
+ }
+ # same logic for the upper confidence bound of the median
+ if (r_min_ind == n_time) {
+ if (r_ind_sum > 0) {
+ if (as.scalar (r_ind[n_time,1]) == 0) { # NA at last position
+ M[M_offset,5] = time[n_time - r_ind_sum,1];
+ } else {
+ M[M_offset,5] = time[1,1];
+ }
+ }
+ } else {
+ M[M_offset,5] = time[r_min_ind + 1,1];
+ }
+ M[M_offset,3] = t_5;
+ if (test_type != "none"){
+ n_event_all_global[g,s] = n_event_sum_all;
+ }
+ }
+ } else {
+ # empty (group, stratum) cell: mask its KM columns and M row for removal
+ print ("group " + g + " is not present in the stratum " + s);
+ KM_cols_select[(KM_offset + 1):(KM_offset + 7),1] = matrix (0, rows = 7, cols = 1);
+ M_cols[M_offset,1] = 0;
+ }
+ }
+
+
+ ######## COMPARISON BETWEEN DIFFERENT GROUPS USING LOG-RANK OR WILCOXON TEST
+
+ if (num_groups > 1 & test_type != "none") {
+
+ # V accumulates the (num_groups-1) x (num_groups-1) covariance matrix
+ # of the per-group O-E statistics for this stratum
+ V = matrix (0, rows = num_groups-1, cols = num_groups-1);
+ parfor (g in 0:(num_groups-1), check = 0) {
+
+ n_risk = n_risk_n_event_stratum[,g * 2 + 1];
+ n_event = n_risk_n_event_stratum[,g * 2 + 2];
+
+ # observed (O) and expected (E) event counts per distinct timestamp;
+ # the wilcoxon variant weights by the overall number at risk
+ if (test_type == "log-rank") {
+ O = n_event;
+ E = n_risk * n_event_stratum / n_risk_stratum;
+ } else { ### test_type == "wilcoxon"
+ O = n_risk_stratum * n_event / range;
+ E = n_risk * n_event_stratum / range;
+ }
+ U[(g + 1),s] = sum (O - E);
+ U_OE[g + 1, s] = (sum (O - E)*sum (O - E))/sum(E);
+ OBS[g + 1, s] = sum(O);
+ EXP[g + 1, s] = sum(E);
+ }
+
+ # Deliberately sequential (was parfor): the inner i2 loop writes into the
+ # shared matrix V. BUGFIX: dropped the leftover ", check = 0" argument --
+ # check is a parfor-only parameter and has no meaning on a plain for loop.
+ for (i1 in 0:(num_groups - 2)) {
+
+ n_risk = n_risk_n_event_stratum[,1 + i1 * 2];
+ n_event = n_risk_n_event_stratum[,2 + i1 * 2];
+ for (i2 in 0:(num_groups - 2)) {
+
+ n_risk_i2j = n_risk_n_event_stratum[,1 + i2 * 2];
+ # Kronecker delta for the diagonal of the covariance matrix
+ I_i1i2 = 0;
+ if (i1 == i2) {
+ I_i1i2 = 1;
+ }
+ if (test_type == "log-rank") {
+ # hypergeometric variance term; 0/0 at timestamps with a single
+ # subject at risk is mapped back to 0
+ V1 = n_risk * n_event_stratum * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
+ V1 = replace (target = V1, pattern = "NaN", replacement = 0);
+ V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
+ V[(i1 + 1),(i2 + 1)] = sum (V1 * V2);
+ } else { ### test_type == "wilcoxon"
+ V1 = (n_risk_stratum ^ 2) * (n_risk * n_event_stratum) * (n_risk_stratum - n_event_stratum) / (n_risk_stratum * (n_risk_stratum - 1));
+ V1 = replace (target = V1, pattern = "NaN", replacement = 0);
+ V2 = I_i1i2 - (n_risk_i2j / n_risk_stratum);
+ V[(i1 + 1),(i2 + 1)] = sum (V1 * V2) / (range ^ 2);
+ }
+ }
+ }
+ # stash this stratum's covariance block for the global aggregation below
+ V_start_ind = (s - 1) * (num_groups - 1) + 1;
+ V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)] = V;
+ }
+}
+
+# Aggregate the per-stratum test statistics into the global chi-squared test
+if (num_groups > 1 & test_type != "none") {
+ V_sum = matrix (0, rows = num_groups-1, cols = num_groups-1);
+ for (s in 1:num_strata) {
+ V_start_ind = (s - 1) * (num_groups - 1) + 1;
+ V_sum_total_part = V_sum_total[,V_start_ind:(V_start_ind + num_groups - 2)];
+ V_sum = V_sum + V_sum_total_part;
+ }
+
+ U_sum = rowSums (U);
+
+ # chi-squared statistic U' * inv(V) * U on (num_groups - 1) degrees of freedom
+ test_st = as.scalar (t(U_sum[1:(num_groups-1),1]) %*% inv(V_sum) %*% U_sum[1:(num_groups-1),1]);
+ p_val = 1 - cdf (target = test_st, dist = "chisq", df = num_groups-1 );
+ # NOTE: this inner condition is always true here -- the enclosing if
+ # already requires test_type != "none"
+ if (test_type != "none") {
+ U_OE_sum = rowSums(U_OE);
+ V_OE =rowSums((U*U) /sum(V_sum));
+ TEST_GROUPS_OE[1,1] = num_groups;
+ TEST_GROUPS_OE[1,2] = num_groups - 1;
+ TEST_GROUPS_OE[1,3] = test_st;
+ TEST_GROUPS_OE[1,4] = p_val;
+ TEST[,1] = rowSums(n_event_all_global);
+ TEST[,2] = rowSums(OBS);
+ TEST[,3] = rowSums(EXP);
+ TEST[,4] = rowSums(U_OE_sum);
+ TEST[,5] = rowSums(V_OE);
+ str = append (str, test_type + " test for " + num_groups + " groups: Chi-squared = " + test_st + " on " + (num_groups - 1) + " df, p = " + p_val + " ");
+ }
+}
+
+# Assemble and write the output matrices M (medians), KM (curves), TEST
+GSI = removeEmpty (target = GSI, margin = "rows");
+if (n_group_cols > 0) {
+ # making a copy of unique groups before adding new rows depending on strata
+ G_cols_original = G_cols;
+
+ # expand the unique group-factor rows to one row per (group, stratum) cell
+ GSI_1 = GSI[,1];
+ tab_G = table (seq (1, nrow (GSI_1)), GSI_1, nrow (GSI_1), nrow (G_cols));
+ G_cols = tab_G %*% G_cols;
+}
+
+if (n_stratum_cols > 0) {
+ GSI_2 = GSI[,2];
+ tab_S = table (seq (1, nrow (GSI_2)), GSI_2, nrow (GSI_2), nrow (S_cols));
+ S_cols = tab_S %*% S_cols;
+}
+
+# pull out non-empty rows from M; Inf sentinels (unwritten entries, e.g.
+# nonexistent medians) become NaN in the output
+M_cols = removeEmpty (target = M_cols, margin = "rows");
+tab_M = table (seq (1, nrow (M_cols)), M_cols, nrow (M_cols), nrow (M));
+M = tab_M %*% M;
+M = replace (target = M, pattern = "Infinity", replacement = "NaN");
+
+# prepend the group/stratum factor label columns to M (and to TEST)
+if (n_group_cols > 0 & n_stratum_cols > 0) {
+ M = append (append (G_cols, S_cols), M);
+ if (test_type != "none") {
+ TEST = append (G_cols_original, TEST);
+ }
+} else if (n_group_cols > 0) {
+ M = append (G_cols, M);
+ if (test_type != "none") {
+ TEST = append (G_cols_original, TEST);
+ }
+} else if (n_stratum_cols > 0) {
+ M = append (S_cols, M);
+}
+
+# pull out non-empty columns from KM: the appended selector column becomes
+# an all-ones guard row after the transpose and is dropped again below
+KM = t (append (t (KM), KM_cols_select) * KM_cols_select);
+KM = removeEmpty (target = KM, margin = "cols");
+KM = removeEmpty (target = KM, margin = "rows");
+KM = KM[1:(nrow (KM) - 1),];
+
+# write output matrices
+write (M, fileM, format=fmtO);
+write (KM, fileO, format=fmtO);
+
+# test results go to fileT when provided, otherwise to stdout
+if (test_type != "none") {
+ if (num_groups > 1 & fileT != " ") {
+ write (TEST, fileT, format=fmtO);
+ write (TEST_GROUPS_OE, fileT+".groups.oe", format=fmtO);
+ } else {
+ print (str);
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Kmeans-predict.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Kmeans-predict.dml b/scripts/algorithms/Kmeans-predict.dml
index 8496585..5bd78bd 100644
--- a/scripts/algorithms/Kmeans-predict.dml
+++ b/scripts/algorithms/Kmeans-predict.dml
@@ -1,339 +1,339 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Compares two categorical data vectors (presumed to be clusterings) and
-# counts matching/nonmatching same-cluster/different-cluster pairs of rows
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ---------------------------------------------------------------------------
-# spY String " " Location to read a column-vector with the "specified"
-# assignment of records (rows) to categories (clusters)
-# prY String " " Location to read (or write, if X and C are present) a
-# column-vector with the "predicted" assignment of rows
-# to clusters. NOTE: The same category may be labeled
-# differently in each of the two vectors, spY and prY.
-# fmt String "text" Matrix output format for prY, usually "text" or "csv"
-# X String " " Location to read matrix X with the input data records
-# C String " " Location to read matrix C with the cluster centroids
-# NOTE: If X and C are present, prY is an output file.
-# O String " " Location to write the printed output statistics
-# ---------------------------------------------------------------------------
-#
-# The "O" file provides the output statistics in CSV format, one per line, in
-# the following format: NAME, [CID], VALUE. Note:
-# - The 1st group statistics are given if X input is available;
-# - The 2nd group statistics are given if X and C inputs are available;
-# - The 3rd and 4th group statistics are given if spY input is available;
-# - Only the 4th group statistics contain a nonempty CID value;
-# - When present, CID contains either the specified category label or the
-# predicted cluster label.
-#
-# NAME CID MEANING
-# ---------------------------------------------------------------------------
-# TSS Total Sum of Squares (from the total mean)
-# WCSS_M Within-Cluster Sum of Squares (means as centers)
-# WCSS_M_PC Within-Cluster Sum of Squares (means), in % of TSS
-# BCSS_M Between-Cluster Sum of Squares (means as centers)
-# BCSS_M_PC Between-Cluster Sum of Squares (means), in % of TSS
-#
-# WCSS_C Within-Cluster Sum of Squares (centroids as centers)
-# WCSS_C_PC Within-Cluster Sum of Squares (centroids), % of TSS
-# BCSS_C Between-Cluster Sum of Squares (centroids as centers)
-# BCSS_C_PC Between-Cluster Sum of Squares (centroids), % of TSS
-#
-# TRUE_SAME_CT Same-category pairs predicted as Same-cluster, count
-# TRUE_SAME_PC Same-category pairs predicted as Same-cluster, %
-# TRUE_DIFF_CT Diff-category pairs predicted as Diff-cluster, count
-# TRUE_DIFF_PC Diff-category pairs predicted as Diff-cluster, %
-# FALSE_SAME_CT Diff-category pairs predicted as Same-cluster, count
-# FALSE_SAME_PC Diff-category pairs predicted as Same-cluster, %
-# FALSE_DIFF_CT Same-category pairs predicted as Diff-cluster, count
-# FALSE_DIFF_PC Same-category pairs predicted as Diff-cluster, %
-#
-# SPEC_TO_PRED + For specified category, the best predicted cluster id
-# SPEC_FULL_CT + For specified category, its full count
-# SPEC_MATCH_CT + For specified category, best-cluster matching count
-# SPEC_MATCH_PC + For specified category, % of matching to full count
-# PRED_TO_SPEC + For predicted cluster, the best specified category id
-# PRED_FULL_CT + For predicted cluster, its full count
-# PRED_MATCH_CT + For predicted cluster, best-category matching count
-# PRED_MATCH_PC + For predicted cluster, % of matching to full count
-# ---------------------------------------------------------------------------
-#
-# Examples:
-# 1. To predict Y given X and C:
-# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
-# C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats
-# 2. To compare "actual" labels spY with "predicted" labels given X and C:
-# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
-# C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats
-# 3. To compare "actual" labels spY with given "predicted" labels prY:
-# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y
-# prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats
-
-
-fmt_prY = ifdef ($fmt, "text");
-filePrY = ifdef ($prY, " ");
-fileSpY = ifdef ($spY, " ");
-fileX = ifdef ($X, " ");
-fileC = ifdef ($C, " ");
-fileO = ifdef ($O, " ");
-
-is_str_empty = TRUE;
-str = " ";
-
-print ("BEGIN K-MEANS SCORING SCRIPT");
-
-if (fileX != " ") {
- print ("Reading X...");
- X = read (fileX);
- total_mean = colSums (X) / nrow (X);
- total_ss = sum( (X - total_mean)^2 );
-}
-
-if ((fileC != " ") & (fileX == " ")) {
- print ("ERROR: Cannot provide C without providing X.");
-} else {
-
-
-if (fileC != " ") {
- print ("Reading C...");
- C = read (fileC);
- num_clusters = nrow (C);
- ones_C = matrix (1, rows = num_clusters, cols = 1);
- print ("Computing the predicted Y...");
- D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
- prY = rowIndexMin (D);
- if (filePrY != " ") {
- print ("Writing the predicted Y...");
- write (prY, filePrY, format=fmt_prY);
- }
-} else {
- print ("Reading the predicted Y...");
- prY = read (filePrY);
- num_clusters = max (prY);
- ones_C = matrix (1, rows = num_clusters, cols = 1);
-}
-
-if (fileX != " ") {
- print ("Computing the WCSS...");
- # Compute projection matrix from clusters to records
- P = matrix (0, rows = nrow (X), cols = num_clusters);
- P [, 1 : max (prY)] = table (seq (1, nrow (X), 1), prY);
- # Compute the means, as opposed to the centroids
- cluster_sizes = t(colSums (P));
- record_of_ones = matrix (1, rows = 1, cols = ncol (X));
- M = (t(P) %*% X) / ((cluster_sizes + ppred (cluster_sizes, 0, "==")) %*% record_of_ones);
- # Compute the WCSS for the means
- wcss_means = sum ((X - P %*% M) ^ 2);
- wcss_means_pc = 100.0 * wcss_means / total_ss;
- bcss_means = sum (cluster_sizes * rowSums ((M - ones_C %*% total_mean) ^ 2));
- bcss_means_pc = 100.0 * bcss_means / total_ss;
- # Output results
- print ("Total Sum of Squares (TSS) = " + total_ss);
- print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means);
- print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means);
- str = "TSS,," + total_ss;
- str = append (str, "WCSS_M,," + wcss_means);
- str = append (str, "WCSS_M_PC,," + wcss_means_pc);
- str = append (str, "BCSS_M,," + bcss_means);
- str = append (str, "BCSS_M_PC,," + bcss_means_pc);
- is_str_empty = FALSE;
-}
-
-if (fileC != " ") {
- # Compute the WCSS for the centroids
- wcss_centroids = sum ((X - P %*% C) ^ 2);
- wcss_centroids_pc = 100.0 * wcss_centroids / total_ss;
- bcss_centroids = sum (cluster_sizes * rowSums ((C - ones_C %*% total_mean) ^ 2));
- bcss_centroids_pc = 100.0 * bcss_centroids / total_ss;
- # Output results
- print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids);
- print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids);
- str = append (str, "WCSS_C,," + wcss_centroids);
- str = append (str, "WCSS_C_PC,," + wcss_centroids_pc);
- str = append (str, "BCSS_C,," + bcss_centroids);
- str = append (str, "BCSS_C_PC,," + bcss_centroids_pc);
-}
-
-
-
-if (fileSpY != " ") {
-
-print ("Reading the specified Y...");
-spY = read (fileSpY);
-num_records = nrow (spY);
-
-if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) {
- print ("ERROR: spY and/or prY size mismatch");
- print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY)
- + "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY));
-} else {
-
- print ("Computing the pairs counts...");
-
- orig_min_spY = min (spY);
- orig_min_prY = min (prY);
- spY = spY + (1 - orig_min_spY);
- prY = prY + (1 - orig_min_prY);
-
- spYprY_row_counts = table (spY, prY);
- spY_row_counts = rowSums (spYprY_row_counts);
- prY_row_counts = t(colSums (spYprY_row_counts));
-
- # Count all pairs of rows having the same (spY, prY)-values
- spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2;
-
- # Count all pairs of rows having the same spY-values
- spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2;
- # Count all pairs of rows having the same prY-values
- prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2;
-
- num_pairs = num_records * (num_records - 1.0) / 2.0;
-
- num_TP_pairs = sum (spYprY_pair_counts);
- num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs;
- num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs;
- num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs;
-
- pct_TP_pairs = num_TP_pairs / num_pairs * 100.0;
- pct_TN_pairs = num_TN_pairs / num_pairs * 100.0;
- pct_FP_pairs = num_FP_pairs / num_pairs * 100.0;
- pct_FN_pairs = num_FN_pairs / num_pairs * 100.0;
-
- if (is_str_empty) {
- str = "TRUE_SAME_CT,," + num_TP_pairs;
- is_str_empty = FALSE;
- } else {
- str = append (str, "TRUE_SAME_CT,," + num_TP_pairs);
- }
- str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs);
- str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs);
- str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs);
- str = append (str, "FALSE_SAME_CT,," + num_FP_pairs);
- str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs);
- str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs);
- str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs);
-
- pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0;
- pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0;
- pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0;
- pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0;
-
- space_TP = ""; if (pct_TP_pairs < 100) {space_TP = " ";} if (pct_TP_pairs < 10) {space_TP = " ";}
- space_TN = ""; if (pct_TN_pairs < 100) {space_TN = " ";} if (pct_TN_pairs < 10) {space_TN = " ";}
- space_FP = ""; if (pct_FP_pairs < 100) {space_FP = " ";} if (pct_FP_pairs < 10) {space_FP = " ";}
- space_FN = ""; if (pct_FN_pairs < 100) {space_FN = " ";} if (pct_FN_pairs < 10) {space_FN = " ";}
-
- print ("Same-cluster pairs predicted as Same-cluster ( True Pos): " + space_TP
- + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")");
- print ("Diff-cluster pairs predicted as Diff-cluster ( True Neg): " + space_TN
- + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")");
- print ("Diff-cluster pairs predicted as Same-cluster (False Pos): " + space_FP
- + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")");
- print ("Same-cluster pairs predicted as Diff-cluster (False Neg): " + space_FN
- + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")");
-
- [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] =
- get_best_assignments (spYprY_row_counts);
-
- print (" ");
- print ("Specified Categories versus Predicted Clusters:");
-
- spY_cids = spY_cids + orig_min_spY - 1;
- prY_cids = prY_cids + orig_min_prY - 1;
-
- for (i in 1 : nrow (spY_cids))
- {
- cid = as.integer (castAsScalar (spY_cids [i, 1]));
- pct = castAsScalar (rounded_percentages [i, 1]);
- space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
- print ("Category " + cid +
- ": best pred. cluster is " + as.integer (castAsScalar (prY_cids [i, 1])) +
- "; full count = " + as.integer (castAsScalar (full_counts [i, 1])) +
- ", matching count = " + space_pct + pct + "% (" +
- as.integer (castAsScalar (matching_counts [i, 1])) + ")");
-
- str = append (str, "SPEC_TO_PRED," + cid + "," + castAsScalar (prY_cids [i, 1]));
- str = append (str, "SPEC_FULL_CT," + cid + "," + castAsScalar (full_counts [i, 1]));
- str = append (str, "SPEC_MATCH_CT," + cid + "," + castAsScalar (matching_counts [i, 1]));
- str = append (str, "SPEC_MATCH_PC," + cid + "," + castAsScalar (rounded_percentages [i, 1]));
- }
-
- [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] =
- get_best_assignments (t(spYprY_row_counts));
-
- print (" ");
- print ("Predicted Clusters versus Specified Categories:");
-
- prY_cids = prY_cids + orig_min_prY - 1;
- spY_cids = spY_cids + orig_min_spY - 1;
-
- for (i in 1 : nrow (prY_cids))
- {
- cid = as.integer (castAsScalar (prY_cids [i, 1]));
- pct = castAsScalar (rounded_percentages [i, 1]);
- space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
- print ("Cluster " + cid +
- ": best spec. categ. is " + as.integer (castAsScalar (spY_cids [i, 1])) +
- "; full count = " + as.integer (castAsScalar (full_counts [i, 1])) +
- ", matching count = " + space_pct + pct + "% (" +
- as.integer (castAsScalar (matching_counts [i, 1])) + ")");
-
- str = append (str, "PRED_TO_SPEC," + cid + "," + castAsScalar (spY_cids [i, 1]));
- str = append (str, "PRED_FULL_CT," + cid + "," + castAsScalar (full_counts [i, 1]));
- str = append (str, "PRED_MATCH_CT," + cid + "," + castAsScalar (matching_counts [i, 1]));
- str = append (str, "PRED_MATCH_PC," + cid + "," + castAsScalar (rounded_percentages [i, 1]));
- }
-
- print (" ");
-}}}
-
-if ((fileO != " ") & (! is_str_empty)) {
- write (str, fileO);
-}
-
-print ("DONE: K-MEANS SCORING SCRIPT");
-
-
-
-get_best_assignments = function (Matrix[double] counts)
-return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins,
- Matrix[double] max_counts, Matrix[double] rounded_percentages)
-{
- margins = rowSums (counts);
- select_positive = diag (ppred (margins, 0, ">"));
- select_positive = removeEmpty (target = select_positive, margin = "rows");
- row_ids = select_positive %*% seq (1, nrow (margins), 1);
- pos_counts = select_positive %*% counts;
- pos_margins = select_positive %*% margins;
- max_counts = rowMaxs (pos_counts);
- one_per_column = matrix (1, rows = 1, cols = ncol (pos_counts));
- max_counts_ppred = max_counts %*% one_per_column;
- is_max_count = ppred (pos_counts, max_counts_ppred, "==");
- aggr_is_max_count = t(cumsum (t(is_max_count)));
- col_ids = rowSums (ppred (aggr_is_max_count, 0, "==")) + 1;
- rounded_percentages = round (1000000.0 * max_counts / pos_margins) / 10000.0;
-}
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Compares two categorical data vectors (presumed to be clusterings) and
+# counts matching/nonmatching same-cluster/different-cluster pairs of rows
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ---------------------------------------------------------------------------
+# spY String " " Location to read a column-vector with the "specified"
+# assignment of records (rows) to categories (clusters)
+# prY String " " Location to read (or write, if X and C are present) a
+# column-vector with the "predicted" assignment of rows
+# to clusters. NOTE: The same category may be labeled
+# differently in each of the two vectors, spY and prY.
+# fmt String "text" Matrix output format for prY, usually "text" or "csv"
+# X String " " Location to read matrix X with the input data records
+# C String " " Location to read matrix C with the cluster centroids
+# NOTE: If X and C are present, prY is an output file.
+# O String " " Location to write the printed output statistics
+# ---------------------------------------------------------------------------
+#
+# The "O" file provides the output statistics in CSV format, one per line, in
+# the following format: NAME, [CID], VALUE. Note:
+# - The 1st group statistics are given if X input is available;
+# - The 2nd group statistics are given if X and C inputs are available;
+# - The 3rd and 4th group statistics are given if spY input is available;
+# - Only the 4th group statistics contain a nonempty CID value;
+# - When present, CID contains either the specified category label or the
+# predicted cluster label.
+#
+# NAME CID MEANING
+# ---------------------------------------------------------------------------
+# TSS Total Sum of Squares (from the total mean)
+# WCSS_M Within-Cluster Sum of Squares (means as centers)
+# WCSS_M_PC Within-Cluster Sum of Squares (means), in % of TSS
+# BCSS_M Between-Cluster Sum of Squares (means as centers)
+# BCSS_M_PC Between-Cluster Sum of Squares (means), in % of TSS
+#
+# WCSS_C Within-Cluster Sum of Squares (centroids as centers)
+# WCSS_C_PC Within-Cluster Sum of Squares (centroids), % of TSS
+# BCSS_C Between-Cluster Sum of Squares (centroids as centers)
+# BCSS_C_PC Between-Cluster Sum of Squares (centroids), % of TSS
+#
+# TRUE_SAME_CT Same-category pairs predicted as Same-cluster, count
+# TRUE_SAME_PC Same-category pairs predicted as Same-cluster, %
+# TRUE_DIFF_CT Diff-category pairs predicted as Diff-cluster, count
+# TRUE_DIFF_PC Diff-category pairs predicted as Diff-cluster, %
+# FALSE_SAME_CT Diff-category pairs predicted as Same-cluster, count
+# FALSE_SAME_PC Diff-category pairs predicted as Same-cluster, %
+# FALSE_DIFF_CT Same-category pairs predicted as Diff-cluster, count
+# FALSE_DIFF_PC Same-category pairs predicted as Diff-cluster, %
+#
+# SPEC_TO_PRED + For specified category, the best predicted cluster id
+# SPEC_FULL_CT + For specified category, its full count
+# SPEC_MATCH_CT + For specified category, best-cluster matching count
+# SPEC_MATCH_PC + For specified category, % of matching to full count
+# PRED_TO_SPEC + For predicted cluster, the best specified category id
+# PRED_FULL_CT + For predicted cluster, its full count
+# PRED_MATCH_CT + For predicted cluster, best-category matching count
+# PRED_MATCH_PC + For predicted cluster, % of matching to full count
+# ---------------------------------------------------------------------------
+#
+# Examples:
+# 1. To predict Y given X and C:
+# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
+# C=INPUT_DIR/C prY=OUTPUT_DIR/PredY O=OUTPUT_DIR/stats
+# 2. To compare "actual" labels spY with "predicted" labels given X and C:
+# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs X=INPUT_DIR/X
+# C=INPUT_DIR/C spY=INPUT_DIR/Y O=OUTPUT_DIR/stats
+# 3. To compare "actual" labels spY with given "predicted" labels prY:
+# hadoop jar SystemML.jar -f Kmeans-predict.dml -nvargs spY=INPUT_DIR/Y
+# prY=INPUT_DIR/PredY O=OUTPUT_DIR/stats
+
+
+fmt_prY = ifdef ($fmt, "text");
+filePrY = ifdef ($prY, " ");
+fileSpY = ifdef ($spY, " ");
+fileX = ifdef ($X, " ");
+fileC = ifdef ($C, " ");
+fileO = ifdef ($O, " ");
+
+is_str_empty = TRUE;
+str = " ";
+
+print ("BEGIN K-MEANS SCORING SCRIPT");
+
+if (fileX != " ") {
+ print ("Reading X...");
+ X = read (fileX);
+ total_mean = colSums (X) / nrow (X);
+ total_ss = sum( (X - total_mean)^2 );
+}
+
+if ((fileC != " ") & (fileX == " ")) {
+ print ("ERROR: Cannot provide C without providing X.");
+} else {
+
+
+if (fileC != " ") {
+ print ("Reading C...");
+ C = read (fileC);
+ num_clusters = nrow (C);
+ ones_C = matrix (1, rows = num_clusters, cols = 1);
+ print ("Computing the predicted Y...");
+ D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2));
+ prY = rowIndexMin (D);
+ if (filePrY != " ") {
+ print ("Writing the predicted Y...");
+ write (prY, filePrY, format=fmt_prY);
+ }
+} else {
+ print ("Reading the predicted Y...");
+ prY = read (filePrY);
+ num_clusters = max (prY);
+ ones_C = matrix (1, rows = num_clusters, cols = 1);
+}
+
+if (fileX != " ") {
+ print ("Computing the WCSS...");
+ # Compute projection matrix from clusters to records
+ P = matrix (0, rows = nrow (X), cols = num_clusters);
+ P [, 1 : max (prY)] = table (seq (1, nrow (X), 1), prY);
+ # Compute the means, as opposed to the centroids
+ cluster_sizes = t(colSums (P));
+ record_of_ones = matrix (1, rows = 1, cols = ncol (X));
+ M = (t(P) %*% X) / ((cluster_sizes + ppred (cluster_sizes, 0, "==")) %*% record_of_ones);
+ # Compute the WCSS for the means
+ wcss_means = sum ((X - P %*% M) ^ 2);
+ wcss_means_pc = 100.0 * wcss_means / total_ss;
+ bcss_means = sum (cluster_sizes * rowSums ((M - ones_C %*% total_mean) ^ 2));
+ bcss_means_pc = 100.0 * bcss_means / total_ss;
+ # Output results
+ print ("Total Sum of Squares (TSS) = " + total_ss);
+ print ("WCSS for cluster means: " + (round (10000.0 * wcss_means_pc) / 10000.0) + "% of TSS = " + wcss_means);
+ print ("BCSS for cluster means: " + (round (10000.0 * bcss_means_pc) / 10000.0) + "% of TSS = " + bcss_means);
+ str = "TSS,," + total_ss;
+ str = append (str, "WCSS_M,," + wcss_means);
+ str = append (str, "WCSS_M_PC,," + wcss_means_pc);
+ str = append (str, "BCSS_M,," + bcss_means);
+ str = append (str, "BCSS_M_PC,," + bcss_means_pc);
+ is_str_empty = FALSE;
+}
+
+if (fileC != " ") {
+ # Compute the WCSS for the centroids
+ wcss_centroids = sum ((X - P %*% C) ^ 2);
+ wcss_centroids_pc = 100.0 * wcss_centroids / total_ss;
+ bcss_centroids = sum (cluster_sizes * rowSums ((C - ones_C %*% total_mean) ^ 2));
+ bcss_centroids_pc = 100.0 * bcss_centroids / total_ss;
+ # Output results
+ print ("WCSS for centroids: " + (round (10000.0 * wcss_centroids_pc) / 10000.0) + "% of TSS = " + wcss_centroids);
+ print ("BCSS for centroids: " + (round (10000.0 * bcss_centroids_pc) / 10000.0) + "% of TSS = " + bcss_centroids);
+ str = append (str, "WCSS_C,," + wcss_centroids);
+ str = append (str, "WCSS_C_PC,," + wcss_centroids_pc);
+ str = append (str, "BCSS_C,," + bcss_centroids);
+ str = append (str, "BCSS_C_PC,," + bcss_centroids_pc);
+}
+
+
+
+if (fileSpY != " ") {
+
+print ("Reading the specified Y...");
+spY = read (fileSpY);
+num_records = nrow (spY);
+
+if (num_records != nrow (prY) | ncol (spY) != 1 | ncol (prY) != 1) {
+ print ("ERROR: spY and/or prY size mismatch");
+ print ("nrow (spY) = " + nrow (spY) + "; ncol (spY) = " + ncol (spY)
+ + "; nrow (prY) = " + nrow (prY) + "; ncol (prY) = " + ncol (prY));
+} else {
+
+ print ("Computing the pairs counts...");
+
+ orig_min_spY = min (spY);
+ orig_min_prY = min (prY);
+ spY = spY + (1 - orig_min_spY);
+ prY = prY + (1 - orig_min_prY);
+
+ spYprY_row_counts = table (spY, prY);
+ spY_row_counts = rowSums (spYprY_row_counts);
+ prY_row_counts = t(colSums (spYprY_row_counts));
+
+ # Count all pairs of rows having the same (spY, prY)-values
+ spYprY_pair_counts = spYprY_row_counts * (spYprY_row_counts - 1) / 2;
+
+ # Count all pairs of rows having the same spY-values
+ spY_pair_counts = spY_row_counts * (spY_row_counts - 1) / 2;
+ # Count all pairs of rows having the same prY-values
+ prY_pair_counts = prY_row_counts * (prY_row_counts - 1) / 2;
+
+ num_pairs = num_records * (num_records - 1.0) / 2.0;
+
+ num_TP_pairs = sum (spYprY_pair_counts);
+ num_FP_pairs = sum (prY_pair_counts) - num_TP_pairs;
+ num_FN_pairs = sum (spY_pair_counts) - num_TP_pairs;
+ num_TN_pairs = num_pairs - num_TP_pairs - num_FP_pairs - num_FN_pairs;
+
+ pct_TP_pairs = num_TP_pairs / num_pairs * 100.0;
+ pct_TN_pairs = num_TN_pairs / num_pairs * 100.0;
+ pct_FP_pairs = num_FP_pairs / num_pairs * 100.0;
+ pct_FN_pairs = num_FN_pairs / num_pairs * 100.0;
+
+ if (is_str_empty) {
+ str = "TRUE_SAME_CT,," + num_TP_pairs;
+ is_str_empty = FALSE;
+ } else {
+ str = append (str, "TRUE_SAME_CT,," + num_TP_pairs);
+ }
+ str = append (str, "TRUE_SAME_PC,," + pct_TP_pairs);
+ str = append (str, "TRUE_DIFF_CT,," + num_TN_pairs);
+ str = append (str, "TRUE_DIFF_PC,," + pct_TN_pairs);
+ str = append (str, "FALSE_SAME_CT,," + num_FP_pairs);
+ str = append (str, "FALSE_SAME_PC,," + pct_FP_pairs);
+ str = append (str, "FALSE_DIFF_CT,," + num_FN_pairs);
+ str = append (str, "FALSE_DIFF_PC,," + pct_FN_pairs);
+
+ pct_TP_pairs = round (pct_TP_pairs * 10000.0) / 10000.0;
+ pct_TN_pairs = round (pct_TN_pairs * 10000.0) / 10000.0;
+ pct_FP_pairs = round (pct_FP_pairs * 10000.0) / 10000.0;
+ pct_FN_pairs = round (pct_FN_pairs * 10000.0) / 10000.0;
+
+ space_TP = ""; if (pct_TP_pairs < 100) {space_TP = " ";} if (pct_TP_pairs < 10) {space_TP = " ";}
+ space_TN = ""; if (pct_TN_pairs < 100) {space_TN = " ";} if (pct_TN_pairs < 10) {space_TN = " ";}
+ space_FP = ""; if (pct_FP_pairs < 100) {space_FP = " ";} if (pct_FP_pairs < 10) {space_FP = " ";}
+ space_FN = ""; if (pct_FN_pairs < 100) {space_FN = " ";} if (pct_FN_pairs < 10) {space_FN = " ";}
+
+ print ("Same-cluster pairs predicted as Same-cluster ( True Pos): " + space_TP
+ + pct_TP_pairs + "% of all pairs" + " (" + num_TP_pairs + ")");
+ print ("Diff-cluster pairs predicted as Diff-cluster ( True Neg): " + space_TN
+ + pct_TN_pairs + "% of all pairs" + " (" + num_TN_pairs + ")");
+ print ("Diff-cluster pairs predicted as Same-cluster (False Pos): " + space_FP
+ + pct_FP_pairs + "% of all pairs" + " (" + num_FP_pairs + ")");
+ print ("Same-cluster pairs predicted as Diff-cluster (False Neg): " + space_FN
+ + pct_FN_pairs + "% of all pairs" + " (" + num_FN_pairs + ")");
+
+ [spY_cids, prY_cids, full_counts, matching_counts, rounded_percentages] =
+ get_best_assignments (spYprY_row_counts);
+
+ print (" ");
+ print ("Specified Categories versus Predicted Clusters:");
+
+ spY_cids = spY_cids + orig_min_spY - 1;
+ prY_cids = prY_cids + orig_min_prY - 1;
+
+ for (i in 1 : nrow (spY_cids))
+ {
+ cid = as.integer (castAsScalar (spY_cids [i, 1]));
+ pct = castAsScalar (rounded_percentages [i, 1]);
+ space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
+ print ("Category " + cid +
+ ": best pred. cluster is " + as.integer (castAsScalar (prY_cids [i, 1])) +
+ "; full count = " + as.integer (castAsScalar (full_counts [i, 1])) +
+ ", matching count = " + space_pct + pct + "% (" +
+ as.integer (castAsScalar (matching_counts [i, 1])) + ")");
+
+ str = append (str, "SPEC_TO_PRED," + cid + "," + castAsScalar (prY_cids [i, 1]));
+ str = append (str, "SPEC_FULL_CT," + cid + "," + castAsScalar (full_counts [i, 1]));
+ str = append (str, "SPEC_MATCH_CT," + cid + "," + castAsScalar (matching_counts [i, 1]));
+ str = append (str, "SPEC_MATCH_PC," + cid + "," + castAsScalar (rounded_percentages [i, 1]));
+ }
+
+ [prY_cids, spY_cids, full_counts, matching_counts, rounded_percentages] =
+ get_best_assignments (t(spYprY_row_counts));
+
+ print (" ");
+ print ("Predicted Clusters versus Specified Categories:");
+
+ prY_cids = prY_cids + orig_min_prY - 1;
+ spY_cids = spY_cids + orig_min_spY - 1;
+
+ for (i in 1 : nrow (prY_cids))
+ {
+ cid = as.integer (castAsScalar (prY_cids [i, 1]));
+ pct = castAsScalar (rounded_percentages [i, 1]);
+ space_pct = ""; if (pct < 100) {space_pct = " ";} if (pct < 10) {space_pct = " ";}
+ print ("Cluster " + cid +
+ ": best spec. categ. is " + as.integer (castAsScalar (spY_cids [i, 1])) +
+ "; full count = " + as.integer (castAsScalar (full_counts [i, 1])) +
+ ", matching count = " + space_pct + pct + "% (" +
+ as.integer (castAsScalar (matching_counts [i, 1])) + ")");
+
+ str = append (str, "PRED_TO_SPEC," + cid + "," + castAsScalar (spY_cids [i, 1]));
+ str = append (str, "PRED_FULL_CT," + cid + "," + castAsScalar (full_counts [i, 1]));
+ str = append (str, "PRED_MATCH_CT," + cid + "," + castAsScalar (matching_counts [i, 1]));
+ str = append (str, "PRED_MATCH_PC," + cid + "," + castAsScalar (rounded_percentages [i, 1]));
+ }
+
+ print (" ");
+}}}
+
+if ((fileO != " ") & (! is_str_empty)) {
+ write (str, fileO);
+}
+
+print ("DONE: K-MEANS SCORING SCRIPT");
+
+
+
+get_best_assignments = function (Matrix[double] counts)
+return (Matrix[double] row_ids, Matrix[double] col_ids, Matrix[double] margins,
+ Matrix[double] max_counts, Matrix[double] rounded_percentages)
+{
+ margins = rowSums (counts);
+ select_positive = diag (ppred (margins, 0, ">"));
+ select_positive = removeEmpty (target = select_positive, margin = "rows");
+ row_ids = select_positive %*% seq (1, nrow (margins), 1);
+ pos_counts = select_positive %*% counts;
+ pos_margins = select_positive %*% margins;
+ max_counts = rowMaxs (pos_counts);
+ one_per_column = matrix (1, rows = 1, cols = ncol (pos_counts));
+ max_counts_ppred = max_counts %*% one_per_column;
+ is_max_count = ppred (pos_counts, max_counts_ppred, "==");
+ aggr_is_max_count = t(cumsum (t(is_max_count)));
+ col_ids = rowSums (ppred (aggr_is_max_count, 0, "==")) + 1;
+ rounded_percentages = round (1000000.0 * max_counts / pos_margins) / 10000.0;
+}
+