You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by re...@apache.org on 2017/11/03 21:32:21 UTC
systemml git commit: [MINOR] added additional examples
Repository: systemml
Updated Branches:
refs/heads/master 0d4672207 -> f76f2138a
[MINOR] added additional examples
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/f76f2138
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/f76f2138
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/f76f2138
Branch: refs/heads/master
Commit: f76f2138a4847f2ca52b5bf511907f4838b240b8
Parents: 0d46722
Author: Berthold Reinwald <re...@us.ibm.com>
Authored: Fri Nov 3 14:29:13 2017 -0700
Committer: Berthold Reinwald <re...@us.ibm.com>
Committed: Fri Nov 3 14:29:13 2017 -0700
----------------------------------------------------------------------
...DML Tips and Tricks (aka Fun With DML).ipynb | 204 ++++++++++++++++++-
1 file changed, 199 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/f76f2138/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
----------------------------------------------------------------------
diff --git a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
index c0391ce..b2d2fad 100644
--- a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
+++ b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb
@@ -4,7 +4,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "1. [Cross Validation](#CrossValidation)\n",
+ "1. [Replace NaN with mode](#NaN2Mode)\n",
+ "* [Use sample builtin function to create sample from matrix](#sample)\n",
+ "* [Count of Matching Values in two Matrices/Vectors](#MatchingRows)\n",
+ "* [Cross Validation](#CrossValidation)\n",
"* [Value-based join of two Matrices](#JoinMatrices)\n",
"* [Filter Matrix to include only Frequent Column Values](#FilterMatrix)\n",
"* [Construct (sparse) Matrix from (rowIndex, colIndex, values) triplets](#Construct_sparse_Matrix)\n",
@@ -17,17 +20,17 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 15,
"metadata": {
"collapsed": false,
- "scrolled": true
+ "scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "2017-08-18 21:33:18 UTC\n"
+ "2017-09-22 07:57:57 UTC\n"
]
}
],
@@ -42,6 +45,197 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "## Replace NaN with mode<a id=\"NaN2Mode\" />"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This function replaces NaN values in a column with the mode of that column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Before: \n",
+ "1.000 NaN\n",
+ "1.000 NaN\n",
+ "1.000 2.000\n",
+ "2.000 1.000\n",
+ "1.000 2.000\n",
+ "\n",
+ "After: \n",
+ "1.000 2.000\n",
+ "1.000 2.000\n",
+ "1.000 2.000\n",
+ "2.000 1.000\n",
+ "1.000 2.000\n",
+ "\n",
+ "SystemML Statistics:\n",
+ "Total execution time:\t\t0.001 sec.\n",
+ "Number of executed Spark inst:\t0.\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prog=\"\"\"\n",
+ "# Function for NaN-aware replacement with mode\n",
+ "replaceNaNwithMode = function (matrix[double] X, integer colId) \n",
+ " return (matrix[double] X) \n",
+ "{\n",
+ " Xi = replace (target=X[,colId], pattern=0/0, replacement=max(X[,colId])+1) # replace NaN with largest value + 1\n",
+ " agg = aggregate (target=Xi, groups=Xi, fn=\"count\") # count each distinct value\n",
+ " mode = as.scalar (rowIndexMax(t(agg[1:nrow(agg)-1, ]))) # mode is max frequent value except last value\n",
+ " X[,colId] = replace (target=Xi, pattern=max(Xi), replacement=mode) # fill in mode\n",
+ "}\n",
+ "\n",
+ "X = matrix('1 NaN 1 NaN 1 2 2 1 1 2', rows = 5, cols = 2)\n",
+ "\n",
+ "Y = replaceNaNwithMode (X, 2)\n",
+ "\n",
+ "print (\"Before: \\n\" + toString(X))\n",
+ "print (\"After: \\n\" + toString(Y))\n",
+ "\"\"\"\n",
+ "with jvm_stdout(True):\n",
+ " ml.execute(dml(prog))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Use sample builtin function to create sample from matrix<a id=\"sample\" />"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Use the sample() function to draw row indices without replacement, create a permutation (selection) matrix from them using table(), and multiply it with X to pull the sampled rows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "X: \n",
+ "2.000 1.000\n",
+ "8.000 3.000\n",
+ "5.000 6.000\n",
+ "7.000 9.000\n",
+ "4.000 4.000\n",
+ "\n",
+ "sv: \n",
+ "1.000\n",
+ "4.000\n",
+ "\n",
+ "samples: \n",
+ "2.000 1.000\n",
+ "7.000 9.000\n",
+ "\n",
+ "SystemML Statistics:\n",
+ "Total execution time:\t\t0.001 sec.\n",
+ "Number of executed Spark inst:\t0.\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prog=\"\"\"\n",
+ "X = matrix ('2 1 8 3 5 6 7 9 4 4', rows = 5, cols = 2 )\n",
+ "\n",
+ "nbrSamples = 2\n",
+ "\n",
+ "sv = order (target = sample (nrow (X), nbrSamples, FALSE)) # samples w/o replacement, and order \n",
+ "P = table (seq (1, nbrSamples), sv, nbrSamples, nrow(X)) # permutation matrix\n",
+ "samples = P %*% X; # apply P to perform selection\n",
+ "\n",
+ "\n",
+ "print (\"X: \\n\" + toString(X))\n",
+ "print (\"sv: \\n\" + toString(sv))\n",
+ "print (\"samples: \\n\" + toString(samples))\n",
+ "\"\"\"\n",
+ "with jvm_stdout(True):\n",
+ " ml.execute(dml(prog))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Count of Matching Values in two Matrices/Vectors<a id=\"MatchingRows\" />"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given two matrices/vectors X and Y of the same dimensions, get a count of the positions (rows, for column vectors) where X and Y have the same value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "t(X): 8.000 4.000 5.000 4.000 9.000 10.000\n",
+ "\n",
+ "t(Y): 4.000 9.000 5.000 1.000 9.000 7.000\n",
+ "\n",
+ "Number of Matches: 2.0\n",
+ "\n",
+ "SystemML Statistics:\n",
+ "Total execution time:\t\t0.001 sec.\n",
+ "Number of executed Spark inst:\t0.\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "prog=\"\"\"\n",
+ "X = matrix('8 4 5 4 9 10', rows = 6, cols = 1)\n",
+ "Y = matrix('4 9 5 1 9 7 ', rows = 6, cols = 1)\n",
+ "\n",
+ "matches = sum (X == Y)\n",
+ "\n",
+ "print (\"t(X): \" + toString(t(X)))\n",
+ "print (\"t(Y): \" + toString(t(Y)))\n",
+ "print (\"Number of Matches: \" + matches + \"\\n\")\n",
+ "\"\"\"\n",
+ "with jvm_stdout(True):\n",
+ " ml.execute(dml(prog))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"## Cross Validation<a id=\"CrossValidation\" />"
]
},
@@ -56,7 +250,7 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {
- "collapsed": false
+ "collapsed": true
},
"outputs": [
{