You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2016/02/23 23:16:02 UTC
incubator-systemml git commit: [SYSTEMML-534] Add optional console output of stats to Univar-Stats.dml
Repository: incubator-systemml
Updated Branches:
refs/heads/master a157d0812 -> d88aba81d
[SYSTEMML-534] Add optional console output of stats to Univar-Stats.dml
Add optional input parameter to print stats to console (default is off).
Update INPUT/OUTPUT/INVOKE comments at top of script.
Closes #74.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d88aba81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d88aba81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d88aba81
Branch: refs/heads/master
Commit: d88aba81ddf73805fc74b30dc942a6e200102186
Parents: a157d08
Author: Deron Eriksson <de...@us.ibm.com>
Authored: Tue Feb 23 14:13:02 2016 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Tue Feb 23 14:13:02 2016 -0800
----------------------------------------------------------------------
scripts/algorithms/Univar-Stats.dml | 72 +++++++++++++++++++++++---------
1 file changed, 53 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d88aba81/scripts/algorithms/Univar-Stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml
index 62d6a28..404e002 100644
--- a/scripts/algorithms/Univar-Stats.dml
+++ b/scripts/algorithms/Univar-Stats.dml
@@ -20,27 +20,28 @@
#-------------------------------------------------------------
#
-# DML Script to compute univariate statistics for all attributes
-# in a given data set
+# DML Script to compute univariate statistics for all attributes in a given data set
#
-# Three inputs:
-# $1) X - input data
-# $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
-# kind=1 for scale,
-# kind=2 for nominal,
-# kind=3 for ordinal
-#
-# One output:
-# $STATS) output directory in which following three statistics
-# files are created
-# + base.stats - matrix with all 17 statistics (14 scale,
-# 3 categorical) computed for all attributes
-# + categorical.counts - matrix in which each column
-# gives the category-wise counts for all categories in
-# that attribute
+# INPUT PARAMETERS:
+# -------------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# -------------------------------------------------------------------------------------------------
+# X String --- Location of INPUT data matrix
+# TYPES String --- Location of INPUT matrix that lists the types of the features:
+# 1 for scale, 2 for nominal, 3 for ordinal
+# CONSOLE_OUTPUT Boolean FALSE If TRUE, print summary statistics to console
+# STATS String --- Location of OUTPUT matrix with summary statistics computed for
+# all features (17 statistics - 14 scale, 3 categorical)
+# -------------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of summary statistics
#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv
+# STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
#
+consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE);
+
A = read($X); # data file
K = read($TYPES); # attribute kind file
@@ -63,7 +64,6 @@ maxs = colMaxs(A);
maxDomainSize = max( ppred(K, 1, ">") * maxs );
maxDomain = as.integer(maxDomainSize);
-
parfor(i in 1:n, check=0) {
# project out the i^th column
@@ -146,5 +146,39 @@ parfor(i in 1:n, check=0) {
}
}
-write(baseStats, $STATS);
+if (consoleOutput == TRUE) {
+ for(i in 1:n) {
+ print("-------------------------------------------------");
+ kind = castAsScalar(K[1,i]);
+ if (kind == 1) {
+ print("Feature [" + i + "]: Scale");
+ print(" (01) Minimum | " + as.scalar(baseStats[1,i]));
+ print(" (02) Maximum | " + as.scalar(baseStats[2,i]));
+ print(" (03) Range | " + as.scalar(baseStats[3,i]));
+ print(" (04) Mean | " + as.scalar(baseStats[4,i]));
+ print(" (05) Variance | " + as.scalar(baseStats[5,i]));
+ print(" (06) Std deviation | " + as.scalar(baseStats[6,i]));
+ print(" (07) Std err of mean | " + as.scalar(baseStats[7,i]));
+ print(" (08) Coeff of variation | " + as.scalar(baseStats[8,i]));
+ print(" (09) Skewness | " + as.scalar(baseStats[9,i]));
+ print(" (10) Kurtosis | " + as.scalar(baseStats[10,i]));
+ print(" (11) Std err of skewness | " + as.scalar(baseStats[11,i]));
+ print(" (12) Std err of kurtosis | " + as.scalar(baseStats[12,i]));
+ print(" (13) Median | " + as.scalar(baseStats[13,i]));
+ print(" (14) Interquartile mean | " + as.scalar(baseStats[14,i]));
+ } else {
+ if (kind == 2 | kind == 3) {
+ if (kind == 2) {
+ print("Feature [" + i + "]: Categorical (Nominal)");
+ } else {
+ print("Feature [" + i + "]: Categorical (Ordinal)");
+ }
+ print(" (15) Num of categories | " + as.integer(as.scalar(baseStats[15,i])));
+ print(" (16) Mode | " + as.integer(as.scalar(baseStats[16,i])));
+ print(" (17) Num of modes | " + as.integer(as.scalar(baseStats[17,i])));
+ }
+ }
+ }
+}
+write(baseStats, $STATS);