You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by de...@apache.org on 2016/02/23 23:16:02 UTC

incubator-systemml git commit: [SYSTEMML-534] Add optional console output of stats to Univar-Stats.dml

Repository: incubator-systemml
Updated Branches:
  refs/heads/master a157d0812 -> d88aba81d


[SYSTEMML-534] Add optional console output of stats to Univar-Stats.dml

Add optional input parameter to print stats to console (default is off).
Update INPUT/OUTPUT/INVOKE comments at top of script.

Closes #74.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d88aba81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d88aba81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d88aba81

Branch: refs/heads/master
Commit: d88aba81ddf73805fc74b30dc942a6e200102186
Parents: a157d08
Author: Deron Eriksson <de...@us.ibm.com>
Authored: Tue Feb 23 14:13:02 2016 -0800
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Tue Feb 23 14:13:02 2016 -0800

----------------------------------------------------------------------
 scripts/algorithms/Univar-Stats.dml | 72 +++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d88aba81/scripts/algorithms/Univar-Stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml
index 62d6a28..404e002 100644
--- a/scripts/algorithms/Univar-Stats.dml
+++ b/scripts/algorithms/Univar-Stats.dml
@@ -20,27 +20,28 @@
 #-------------------------------------------------------------
 
 #
-# DML Script to compute univariate statistics for all attributes 
-# in a given data set
+# DML Script to compute univariate statistics for all attributes in a given data set
 #
-# Three inputs:
-#     $1) X - input data
-#     $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes
-#             kind=1 for scale, 
-#             kind=2 for nominal,
-#             kind=3 for ordinal
-#
-# One output:
-#     $STATS) output directory in which following three statistics 
-#         files are created
-#         + base.stats - matrix with all 17 statistics (14 scale, 
-#         3 categorical) computed for all attributes
-#         + categorical.counts - matrix in which each column 
-#         gives the category-wise counts for all categories in 
-#         that attribute
+# INPUT PARAMETERS:
+# -------------------------------------------------------------------------------------------------
+# NAME           TYPE     DEFAULT  MEANING
+# -------------------------------------------------------------------------------------------------
+# X              String   ---      Location of INPUT data matrix
+# TYPES          String   ---      Location of INPUT matrix that lists the types of the features:
+#                                     1 for scale, 2 for nominal, 3 for ordinal
+# CONSOLE_OUTPUT Boolean  FALSE    If TRUE, print summary statistics to console
+# STATS          String   ---      Location of OUTPUT matrix with summary statistics computed for
+#                                  all features (17 statistics - 14 scale, 3 categorical)
+# -------------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of summary statistics
 #
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv
+#    STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
 #
 
+consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE); # optional flag; FALSE (no console report) when CONSOLE_OUTPUT is not supplied
+
 A = read($X); # data file
 K = read($TYPES); # attribute kind file
 
@@ -63,7 +64,6 @@ maxs = colMaxs(A);
 maxDomainSize = max( ppred(K, 1, ">") * maxs );
 maxDomain = as.integer(maxDomainSize);
 
-
 parfor(i in 1:n, check=0) {
 
 	# project out the i^th column
@@ -146,5 +146,39 @@ parfor(i in 1:n, check=0) {
 	}
 }
 
-write(baseStats, $STATS);
+if (consoleOutput == TRUE) { # print a per-feature report of baseStats only when console output was requested
+	for(i in 1:n) { # i indexes the columns of K and baseStats (n is defined outside this hunk)
+		print("-------------------------------------------------");
+		kind = castAsScalar(K[1,i]); # type code of feature i, taken from row 1 of K: 1 = scale, 2 = nominal, 3 = ordinal
+		if (kind == 1) { # scale feature: report rows 1-14 of column i
+			print("Feature [" + i + "]: Scale");
+			print(" (01) Minimum             | " + as.scalar(baseStats[1,i]));
+			print(" (02) Maximum             | " + as.scalar(baseStats[2,i]));
+			print(" (03) Range               | " + as.scalar(baseStats[3,i]));
+			print(" (04) Mean                | " + as.scalar(baseStats[4,i]));
+			print(" (05) Variance            | " + as.scalar(baseStats[5,i]));
+			print(" (06) Std deviation       | " + as.scalar(baseStats[6,i]));
+			print(" (07) Std err of mean     | " + as.scalar(baseStats[7,i]));
+			print(" (08) Coeff of variation  | " + as.scalar(baseStats[8,i]));
+			print(" (09) Skewness            | " + as.scalar(baseStats[9,i]));
+			print(" (10) Kurtosis            | " + as.scalar(baseStats[10,i]));
+			print(" (11) Std err of skewness | " + as.scalar(baseStats[11,i]));
+			print(" (12) Std err of kurtosis | " + as.scalar(baseStats[12,i]));
+			print(" (13) Median              | " + as.scalar(baseStats[13,i]));
+			print(" (14) Interquartile mean  | " + as.scalar(baseStats[14,i]));
+		} else {
+			if (kind == 2 | kind == 3) { # categorical feature: report rows 15-17; any other type code prints only the separator line
+				if (kind == 2) {
+					print("Feature [" + i + "]: Categorical (Nominal)");
+				} else {
+					print("Feature [" + i + "]: Categorical (Ordinal)");
+				}
+				print(" (15) Num of categories   | " + as.integer(as.scalar(baseStats[15,i]))); # rows 15-17 are cast to integer before printing
+				print(" (16) Mode                | " + as.integer(as.scalar(baseStats[16,i])));
+				print(" (17) Num of modes        | " + as.integer(as.scalar(baseStats[17,i])));
+			}
+		}
+	}
+}
 
+write(baseStats, $STATS);