You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by re...@apache.org on 2017/11/03 21:32:21 UTC
systemml git commit: [MINOR] added additional examples

Repository: systemml
Updated Branches:
  refs/heads/master 0d4672207 -> f76f2138a


[MINOR] added additional examples


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f76f2138
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f76f2138
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f76f2138

Branch: refs/heads/master
Commit: f76f2138a4847f2ca52b5bf511907f4838b240b8
Parents: 0d46722
Author: Berthold Reinwald <re...@us.ibm.com>
Authored: Fri Nov 3 14:29:13 2017 -0700
Committer: Berthold Reinwald <re...@us.ibm.com>
Committed: Fri Nov 3 14:29:13 2017 -0700

----------------------------------------------------------------------
 ...DML Tips and Tricks (aka Fun With DML).ipynb | 204 ++++++++++++++++++-
 1 file changed, 199 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/f76f2138/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
----------------------------------------------------------------------
diff --git a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
index c0391ce..b2d2fad 100644
--- a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb	
+++ b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb	
@@ -4,7 +4,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "1. [Cross Validation](#CrossValidation)\n",
+    "1. [Replace NaN with mode](#NaN2Mode)\n",
+    "* [Use sample builtin function to create sample from matrix](#sample)\n",
+    "* [Count of Matching Values in two Matrices/Vectors](#MatchingRows)\n",
+    "* [Cross Validation](#CrossValidation)\n",
     "* [Value-based join of two Matrices](#JoinMatrices)\n",
     "* [Filter Matrix to include only Frequent Column Values](#FilterMatrix)\n",
     "* [Construct (sparse) Matrix from (rowIndex, colIndex, values) triplets](#Construct_sparse_Matrix)\n",
@@ -17,17 +20,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 15,
    "metadata": {
     "collapsed": false,
-    "scrolled": true
+    "scrolled": false
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2017-08-18 21:33:18 UTC\n"
+      "2017-09-22 07:57:57 UTC\n"
      ]
     }
    ],
@@ -42,6 +45,197 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "## Replace NaN with mode<a id=\"NaN2Mode\" />"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This function replaces the NaN values in a column with the mode (most frequent value) of that column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Before: \n",
+      "1.000 NaN\n",
+      "1.000 NaN\n",
+      "1.000 2.000\n",
+      "2.000 1.000\n",
+      "1.000 2.000\n",
+      "\n",
+      "After: \n",
+      "1.000 2.000\n",
+      "1.000 2.000\n",
+      "1.000 2.000\n",
+      "2.000 1.000\n",
+      "1.000 2.000\n",
+      "\n",
+      "SystemML Statistics:\n",
+      "Total execution time:\t\t0.001 sec.\n",
+      "Number of executed Spark inst:\t0.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "prog=\"\"\"\n",
+    "# Function for NaN-aware replacement with mode\n",
+    "replaceNaNwithMode = function (matrix[double] X, integer colId) \n",
+    "      return (matrix[double] X) \n",
+    "{\n",
+    "   Xi = replace (target=X[,colId], pattern=0/0, replacement=max(X[,colId])+1)   # replace NaN with largest value + 1\n",
+    "   agg = aggregate (target=Xi, groups=Xi, fn=\"count\")                           # count each distinct value\n",
+    "   mode = as.scalar (rowIndexMax(t(agg[1:nrow(agg)-1, ])))                      # mode is max frequent value except last value\n",
+    "   X[,colId] = replace (target=Xi, pattern=max(Xi), replacement=mode)           # fill in mode\n",
+    "}\n",
+    "\n",
+    "X = matrix('1 NaN 1 NaN 1 2 2 1 1 2', rows = 5, cols = 2)\n",
+    "\n",
+    "Y = replaceNaNwithMode (X, 2)\n",
+    "\n",
+    "print (\"Before: \\n\" + toString(X))\n",
+    "print (\"After: \\n\" + toString(Y))\n",
+    "\"\"\"\n",
+    "with jvm_stdout(True):\n",
+    "    ml.execute(dml(prog))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Use sample builtin function to create sample from matrix<a id=\"sample\" />"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Use the sample() function to draw row indices, create a permutation (row-selection) matrix from them using table(), and multiply it with X to pull the sampled rows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "X: \n",
+      "2.000 1.000\n",
+      "8.000 3.000\n",
+      "5.000 6.000\n",
+      "7.000 9.000\n",
+      "4.000 4.000\n",
+      "\n",
+      "sv: \n",
+      "1.000\n",
+      "4.000\n",
+      "\n",
+      "samples: \n",
+      "2.000 1.000\n",
+      "7.000 9.000\n",
+      "\n",
+      "SystemML Statistics:\n",
+      "Total execution time:\t\t0.001 sec.\n",
+      "Number of executed Spark inst:\t0.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "prog=\"\"\"\n",
+    "X = matrix ('2 1 8 3 5 6 7 9 4 4', rows = 5, cols = 2 )\n",
+    "\n",
+    "nbrSamples = 2\n",
+    "\n",
+    "sv = order (target = sample (nrow (X), nbrSamples, FALSE))  # samples w/o replacement, and order            \n",
+    "P = table (seq (1, nbrSamples), sv, nbrSamples, nrow(X))    # permutation matrix\n",
+    "samples = P %*% X;                                          # apply P to perform selection\n",
+    "\n",
+    "\n",
+    "print (\"X: \\n\" + toString(X))\n",
+    "print (\"sv: \\n\" + toString(sv))\n",
+    "print (\"samples: \\n\" + toString(samples))\n",
+    "\"\"\"\n",
+    "with jvm_stdout(True):\n",
+    "    ml.execute(dml(prog))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Count of Matching Values in two Matrices/Vectors<a id=\"MatchingRows\" />"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Given two matrices/vectors X and Y of the same dimensions, get a count of the cells (rows, in the case of column vectors) where X and Y have the same value."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "t(X): 8.000 4.000 5.000 4.000 9.000 10.000\n",
+      "\n",
+      "t(Y): 4.000 9.000 5.000 1.000 9.000 7.000\n",
+      "\n",
+      "Number of Matches: 2.0\n",
+      "\n",
+      "SystemML Statistics:\n",
+      "Total execution time:\t\t0.001 sec.\n",
+      "Number of executed Spark inst:\t0.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "prog=\"\"\"\n",
+    "X = matrix('8 4 5 4 9 10', rows = 6, cols = 1)\n",
+    "Y = matrix('4 9 5 1 9 7 ', rows = 6, cols = 1)\n",
+    "\n",
+    "matches = sum (X == Y)\n",
+    "\n",
+    "print (\"t(X): \" + toString(t(X)))\n",
+    "print (\"t(Y): \" + toString(t(Y)))\n",
+    "print (\"Number of Matches: \" + matches + \"\\n\")\n",
+    "\"\"\"\n",
+    "with jvm_stdout(True):\n",
+    "    ml.execute(dml(prog))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "## Cross Validation<a id=\"CrossValidation\" />"
    ]
   },
@@ -56,7 +250,7 @@
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {
-    "collapsed": false
+    "collapsed": true
    },
    "outputs": [
     {