You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by re...@apache.org on 2017/09/11 22:47:13 UTC
systemml git commit: [Minor] added cross validation example

Repository: systemml
Updated Branches:
  refs/heads/master 754548190 -> cddd2a4f6


[Minor] added cross validation example

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cddd2a4f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cddd2a4f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cddd2a4f

Branch: refs/heads/master
Commit: cddd2a4f60e22e8b621712135e5ed263b25343c0
Parents: 7545481
Author: Berthold Reinwald <re...@us.ibm.com>
Authored: Mon Sep 11 15:30:29 2017 -0700
Committer: Berthold Reinwald <re...@us.ibm.com>
Committed: Mon Sep 11 15:46:52 2017 -0700

----------------------------------------------------------------------
 ...DML Tips and Tricks (aka Fun With DML).ipynb | 189 ++++++++++++++++++-
 1 file changed, 186 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/cddd2a4f/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
----------------------------------------------------------------------
diff --git a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
index 23d975a..c0391ce 100644
--- a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb	
+++ b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb	
@@ -4,7 +4,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "1. [Value-based join of two Matrices](#JoinMatrices)\n",
+    "1. [Cross Validation](#CrossValidation)\n",
+    "* [Value-based join of two Matrices](#JoinMatrices)\n",
     "* [Filter Matrix to include only Frequent Column Values](#FilterMatrix)\n",
     "* [Construct (sparse) Matrix from (rowIndex, colIndex, values) triplets](#Construct_sparse_Matrix)\n",
     "* [Find and remove duplicates in columns or rows](#Find_and_remove_duplicates)\n",
@@ -16,12 +17,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "collapsed": false,
     "scrolled": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2017-08-18 21:33:18 UTC\n"
+     ]
+    }
+   ],
    "source": [
     "from systemml import MLContext, dml, jvm_stdout\n",
     "ml = MLContext(sc)\n",
@@ -33,6 +42,180 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Cross Validation<a id=\"CrossValidation\" />"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Perform k-fold cross validation. Fold creation, model training, prediction on the held-out test data, and evaluation are done per fold inside a `parfor` loop, so the folds are processed in parallel."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test data Xyi2\n",
+      "10.000 11.000 12.000 4.000\n",
+      "16.000 17.000 18.000 6.000\n",
+      "\n",
+      "Train data Xyni2\n",
+      "1.000 2.000 3.000 1.000\n",
+      "4.000 5.000 6.000 2.000\n",
+      "7.000 8.000 9.000 3.000\n",
+      "13.000 14.000 15.000 5.000\n",
+      "\n",
+      "w2\n",
+      "95.000\n",
+      "106.000\n",
+      "117.000\n",
+      "\n",
+      "stats2\n",
+      "8938.000\n",
+      "\n",
+      "\n",
+      "Test data Xyi3\n",
+      "1.000 2.000 3.000 1.000\n",
+      "7.000 8.000 9.000 3.000\n",
+      "\n",
+      "Train data Xyni3\n",
+      "4.000 5.000 6.000 2.000\n",
+      "10.000 11.000 12.000 4.000\n",
+      "13.000 14.000 15.000 5.000\n",
+      "16.000 17.000 18.000 6.000\n",
+      "\n",
+      "w3\n",
+      "209.000\n",
+      "226.000\n",
+      "243.000\n",
+      "\n",
+      "stats3\n",
+      "6844.000\n",
+      "\n",
+      "\n",
+      "Test data Xyi1\n",
+      "4.000 5.000 6.000 2.000\n",
+      "13.000 14.000 15.000 5.000\n",
+      "\n",
+      "Train data Xyni1\n",
+      "1.000 2.000 3.000 1.000\n",
+      "7.000 8.000 9.000 3.000\n",
+      "10.000 11.000 12.000 4.000\n",
+      "16.000 17.000 18.000 6.000\n",
+      "\n",
+      "w1\n",
+      "158.000\n",
+      "172.000\n",
+      "186.000\n",
+      "\n",
+      "stats1\n",
+      "9853.000\n",
+      "\n",
+      "\n",
+      "SV selection vector:\n",
+      "3.000\n",
+      "1.000\n",
+      "3.000\n",
+      "2.000\n",
+      "1.000\n",
+      "2.000\n",
+      "\n",
+      "SystemML Statistics:\n",
+      "Total execution time:\t\t0.024 sec.\n",
+      "Number of executed Spark inst:\t0.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "prog = \"\"\"\n",
+    "# k-fold cross validation: assign the rows of Xy to kFolds random folds; a parfor loop trains, tests and evaluates one model per fold.\n",
+    "holdOut = 1/3                                                             # fraction of rows held out as test data per fold\n",
+    "kFolds = 1/holdOut                                                        # number of folds\n",
+    "\n",
+    "nRows = 6; nCols = 3; \n",
+    "\n",
+    "X = matrix(seq(1, nRows * nCols), rows = nRows, cols = nCols)             # X data\n",
+    "y = matrix(seq(1, nRows), rows = nRows, cols = 1)                         # y label data\n",
+    "Xy = cbind (X,y)                                                          # Xy Data for CV (label is the last column)\n",
+    "\n",
+    "sv = rand (rows = nRows, cols = 1, min = 0.0, max = 1.0, pdf = \"uniform\") # sv selection vector for fold creation \n",
+    "sv = (order(target=sv, by=1, index.return=TRUE)) %% kFolds + 1            #    with numbers between 1 .. kFolds \n",
+    "\n",
+    "stats = matrix(0, rows=kFolds, cols=1)                                    # stats per kFolds model on test data\n",
+    "\n",
+    "parfor (i in 1:kFolds)\n",
+    "{\n",
+    "   # Skip empty training data or test data. \n",
+    "   if  ( sum (sv == i) > 0 & sum (sv == i) < nrow(X) )    \n",
+    "   {\n",
+    "      Xyi  = removeEmpty(target = Xy, margin = \"rows\", select = (sv == i))  # Xyi fold, i.e. 1/k of rows (test data)\n",
+    "      Xyni = removeEmpty(target = Xy, margin = \"rows\", select = (sv != i))  # Xyni data, i.e. (k-1)/k of rows (train data)\n",
+    "\n",
+    "      # Skip extreme label imbalance: count the label values (last column of Xy) that occur at least once in the training data\n",
+    "      nLabels = as.integer (sum (aggregate (target = Xyni[ ,ncol(Xy)], groups = Xyni[ ,ncol(Xy)], fn = \"count\") > 0))\n",
+    "      if ( nLabels > 1)\n",
+    "      {\n",
+    "         wi = trainAlg (Xyni[ ,1:(ncol(Xy)-1)], Xyni[ ,ncol(Xy)])           # wi Model for i-th training data\n",
+    "         pi = testAlg  (Xyi [ ,1:(ncol(Xy)-1)], wi)                         # pi Prediction for i-th test data\n",
+    "         ei = evalPrediction (pi, Xyi[ ,ncol(Xy)])                          # ei Evaluation of the prediction for the i-th fold\n",
+    "         stats[i,] = ei                                                     # stats[i,] result row of the i-th fold\n",
+    "\n",
+    "         print (  \"Test data Xyi\" + i + \"\\n\" + toString(Xyi)  \n",
+    "               + \"\\nTrain data Xyni\" + i + \"\\n\" + toString(Xyni)  \n",
+    "               + \"\\nw\" + i + \"\\n\" + toString(wi) \n",
+    "               + \"\\nstats\" + i + \"\\n\" + toString(stats[i,]) \n",
+    "               + \"\\n\")\n",
+    "      }\n",
+    "      else\n",
+    "      {\n",
+    "        print (\"Training data for fold \" + i + \" has only \" + nLabels + \" distinct labels. Needs to be > 1.\")\n",
+    "      }    \n",
+    "   } \n",
+    "   else \n",
+    "   {\n",
+    "      print (\"Training data or test data for fold \" + i + \" is empty. Fold not validated.\")\n",
+    "   }\n",
+    "}\n",
+    "\n",
+    "print (\"SV selection vector:\\n\" + toString(sv))\n",
+    "\n",
+    "# Toy training step: the model w is t(X) %*% y, i.e. one row of w per column of X.\n",
+    "trainAlg = function (matrix[double] X, matrix[double] y) return (matrix[double] w)\n",
+    "{\n",
+    "   Xt = t(X);  w = Xt %*% y\n",
+    "}\n",
+    "\n",
+    "# Toy scoring step: the prediction p is the product X %*% w, i.e. one row of p per row of X.\n",
+    "testAlg = function (matrix[double] X, matrix[double] w) return (matrix[double] p)\n",
+    "{\n",
+    "   scores = X %*% w;  p = scores\n",
+    "}\n",
+    "\n",
+    "# Toy evaluation: e is a 1x1 matrix holding the sum of the signed residuals p - y (errors of opposite sign cancel).\n",
+    "evalPrediction = function (matrix[double] p, matrix[double] y) return (matrix[double] e)\n",
+    "{\n",
+    "   residual = p - y;  e = as.matrix (sum (residual))\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "with jvm_stdout(True):\n",
+    "    ml.execute(dml(prog))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## Value-based join of two Matrices<a id=\"JoinMatrices\"/>"
    ]
   },