You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by re...@apache.org on 2017/09/11 22:47:13 UTC
systemml git commit: [Minor] added cross validation example
Repository: systemml
Updated Branches:
refs/heads/master 754548190 -> cddd2a4f6
[Minor] added cross validation example
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cddd2a4f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cddd2a4f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cddd2a4f
Branch: refs/heads/master
Commit: cddd2a4f60e22e8b621712135e5ed263b25343c0
Parents: 7545481
Author: Berthold Reinwald <re...@us.ibm.com>
Authored: Mon Sep 11 15:30:29 2017 -0700
Committer: Berthold Reinwald <re...@us.ibm.com>
Committed: Mon Sep 11 15:46:52 2017 -0700
----------------------------------------------------------------------
...DML Tips and Tricks (aka Fun With DML).ipynb | 189 ++++++++++++++++++-
1 file changed, 186 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/cddd2a4f/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
----------------------------------------------------------------------
diff --git a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
index 23d975a..c0391ce 100644
--- a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
+++ b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
@@ -4,7 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "1. [Value-based join of two Matrices](#JoinMatrices)\n",
+ "1. [Cross Validation](#CrossValidation)\n",
+ "* [Value-based join of two Matrices](#JoinMatrices)\n",
"* [Filter Matrix to include only Frequent Column Values](#FilterMatrix)\n",
"* [Construct (sparse) Matrix from (rowIndex, colIndex, values) triplets](#Construct_sparse_Matrix)\n",
"* [Find and remove duplicates in columns or rows](#Find_and_remove_duplicates)\n",
@@ -16,12 +17,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-08-18 21:33:18 UTC\n"
+ ]
+ }
+ ],
"source": [
"from systemml import MLContext, dml, jvm_stdout\n",
"ml = MLContext(sc)\n",
@@ -33,6 +42,180 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "## Cross Validation<a id=\"CrossValidation\" />"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Perform k-fold cross validation: for each fold, create the fold, train a model on the remaining rows, score the held-out rows, and evaluate the predictions. The folds are processed in parallel using `parfor`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test data Xyi2\n",
+ "10.000 11.000 12.000 4.000\n",
+ "16.000 17.000 18.000 6.000\n",
+ "\n",
+ "Train data Xyni2\n",
+ "1.000 2.000 3.000 1.000\n",
+ "4.000 5.000 6.000 2.000\n",
+ "7.000 8.000 9.000 3.000\n",
+ "13.000 14.000 15.000 5.000\n",
+ "\n",
+ "w2\n",
+ "95.000\n",
+ "106.000\n",
+ "117.000\n",
+ "\n",
+ "stats2\n",
+ "8938.000\n",
+ "\n",
+ "\n",
+ "Test data Xyi3\n",
+ "1.000 2.000 3.000 1.000\n",
+ "7.000 8.000 9.000 3.000\n",
+ "\n",
+ "Train data Xyni3\n",
+ "4.000 5.000 6.000 2.000\n",
+ "10.000 11.000 12.000 4.000\n",
+ "13.000 14.000 15.000 5.000\n",
+ "16.000 17.000 18.000 6.000\n",
+ "\n",
+ "w3\n",
+ "209.000\n",
+ "226.000\n",
+ "243.000\n",
+ "\n",
+ "stats3\n",
+ "6844.000\n",
+ "\n",
+ "\n",
+ "Test data Xyi1\n",
+ "4.000 5.000 6.000 2.000\n",
+ "13.000 14.000 15.000 5.000\n",
+ "\n",
+ "Train data Xyni1\n",
+ "1.000 2.000 3.000 1.000\n",
+ "7.000 8.000 9.000 3.000\n",
+ "10.000 11.000 12.000 4.000\n",
+ "16.000 17.000 18.000 6.000\n",
+ "\n",
+ "w1\n",
+ "158.000\n",
+ "172.000\n",
+ "186.000\n",
+ "\n",
+ "stats1\n",
+ "9853.000\n",
+ "\n",
+ "\n",
+ "SV selection vector:\n",
+ "3.000\n",
+ "1.000\n",
+ "3.000\n",
+ "2.000\n",
+ "1.000\n",
+ "2.000\n",
+ "\n",
+ "SystemML Statistics:\n",
+ "Total execution time:\t\t0.024 sec.\n",
+ "Number of executed Spark inst:\t0.\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prog = \"\"\"\n",
+ "holdOut = 1/3 # fraction of rows used as test data in each fold\n",
+ "kFolds = 1/holdOut # number of folds\n",
+ "\n",
+ "nRows = 6; nCols = 3; \n",
+ "\n",
+ "X = matrix(seq(1, nRows * nCols), rows = nRows, cols = nCols) # X data\n",
+ "y = matrix(seq(1, nRows), rows = nRows, cols = 1) # y label data\n",
+ "Xy = cbind (X,y) # Xy Data for CV\n",
+ "\n",
+ "sv = rand (rows = nRows, cols = 1, min = 0.0, max = 1.0, pdf = \"uniform\") # sv selection vector for fold creation \n",
+ "sv = (order(target=sv, by=1, index.return=TRUE)) %% kFolds + 1 # with numbers between 1 .. kFolds \n",
+ "\n",
+ "stats = matrix(0, rows=kFolds, cols=1) # stats per kFolds model on test data\n",
+ "\n",
+ "parfor (i in 1:kFolds)\n",
+ "{\n",
+ " # Skip empty training data or test data. \n",
+ " if ( sum (sv == i) > 0 & sum (sv == i) < nrow(X) ) \n",
+ " {\n",
+ " Xyi = removeEmpty(target = Xy, margin = \"rows\", select = (sv == i)) # Xyi fold, i.e. 1/k of rows (test data)\n",
+ " Xyni = removeEmpty(target = Xy, margin = \"rows\", select = (sv != i)) # Xyni data, i.e. (k-1)/k of rows (train data)\n",
+ "\n",
+ " # Skip extreme label imbalance: the training labels (last column of Xy) need more than one distinct value\n",
+ " distinctLabels = aggregate( target = Xyni[,ncol(Xy)], groups = Xyni[,ncol(Xy)], fn = \"count\")\n",
+ " if ( nrow(distinctLabels) > 1)\n",
+ " {\n",
+ " wi = trainAlg (Xyni[ ,1:(ncol(Xy)-1)], Xyni[ ,ncol(Xy)]) # wi Model for i-th training data\n",
+ " pi = testAlg (Xyi [ ,1:(ncol(Xy)-1)], wi) # pi Prediction for i-th test data\n",
+ " ei = evalPrediction (pi, Xyi[ ,ncol(Xy)]) # stats[i,] evaluation of prediction of i-th fold\n",
+ " stats[i,] = ei\n",
+ " \n",
+ " print ( \"Test data Xyi\" + i + \"\\n\" + toString(Xyi) \n",
+ " + \"\\nTrain data Xyni\" + i + \"\\n\" + toString(Xyni) \n",
+ " + \"\\nw\" + i + \"\\n\" + toString(wi) \n",
+ " + \"\\nstats\" + i + \"\\n\" + toString(stats[i,]) \n",
+ " + \"\\n\")\n",
+ " }\n",
+ " else\n",
+ " {\n",
+ " print (\"Training data for fold \" + i + \" has only \" + nrow(distinctLabels) + \" distinct labels. Needs to be > 1.\")\n",
+ " } \n",
+ " } \n",
+ " else \n",
+ " {\n",
+ " print (\"Training data or test data for fold \" + i + \" is empty. Fold not validated.\")\n",
+ " }\n",
+ "\n",
+ "}\n",
+ "\n",
+ "print (\"SV selection vector:\\n\" + toString(sv))\n",
+ "# Toy training algorithm used by the demo: returns the weight vector w = t(X) %*% y\n",
+ "trainAlg = function (matrix[double] X, matrix[double] y)\n",
+ " return (matrix[double] w)\n",
+ "{\n",
+ " w = t(X) %*% y\n",
+ "}\n",
+ "# Toy scoring algorithm used by the demo: returns the predictions p = X %*% w\n",
+ "testAlg = function (matrix[double] X, matrix[double] w)\n",
+ " return (matrix[double] p)\n",
+ "{\n",
+ " p = X %*% w\n",
+ "}\n",
+ "# Toy evaluation used by the demo: returns the sum of the errors (p - y) as a 1x1 matrix\n",
+ "evalPrediction = function (matrix[double] p, matrix[double] y)\n",
+ " return (matrix[double] e)\n",
+ "{\n",
+ " e = as.matrix(sum (p - y))\n",
+ "}\n",
+ "\"\"\"\n",
+ "\n",
+ "with jvm_stdout(True):\n",
+ " ml.execute(dml(prog))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Value-based join of two Matrices<a id=\"JoinMatrices\"/>"
]
},