You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ac...@apache.org on 2017/05/12 17:01:34 UTC
incubator-systemml git commit: [SYSTEMML-1607] Add Linear Regression
Notebook example
Repository: incubator-systemml
Updated Branches:
refs/heads/master 7fe372b9a -> 6adcb369d
[SYSTEMML-1607] Add Linear Regression Notebook example
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/6adcb369
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/6adcb369
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/6adcb369
Branch: refs/heads/master
Commit: 6adcb369d3b9472cfe560c3dd7fef3be2a527bc5
Parents: 7fe372b
Author: Arvind Surve <ac...@yahoo.com>
Authored: Fri May 12 10:01:14 2017 -0700
Committer: Arvind Surve <ac...@yahoo.com>
Committed: Fri May 12 10:01:14 2017 -0700
----------------------------------------------------------------------
.../Linear_Regression_Algorithms_Demo.ipynb | 636 +++++++++++++++++++
1 file changed, 636 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/6adcb369/samples/jupyter-notebooks/Linear_Regression_Algorithms_Demo.ipynb
----------------------------------------------------------------------
diff --git a/samples/jupyter-notebooks/Linear_Regression_Algorithms_Demo.ipynb b/samples/jupyter-notebooks/Linear_Regression_Algorithms_Demo.ipynb
new file mode 100644
index 0000000..90a8048
--- /dev/null
+++ b/samples/jupyter-notebooks/Linear_Regression_Algorithms_Demo.ipynb
@@ -0,0 +1,636 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook shows:\n",
+ "- Install SystemML Python package and jar file\n",
+ " - pip\n",
+ " - SystemML 'Hello World'\n",
+ "- Example 1: Matrix Multiplication\n",
+ " - SystemML script to generate a random matrix, perform matrix multiplication, and compute the sum of the output\n",
+ " - Examine execution plans, and increase data size to observe changed execution plans\n",
+ "- Load diabetes dataset from scikit-learn\n",
+ "- Example 2: Implement three different algorithms to train linear regression model\n",
+ " - Algorithm 1: Linear Regression - Direct Solve (no regularization)\n",
+ " - Algorithm 2: Linear Regression - Batch Gradient Descent (no regularization)\n",
+ " - Algorithm 3: Linear Regression - Conjugate Gradient (no regularization)\n",
+ "- Example 3: Invoke existing SystemML algorithm script LinearRegDS.dml using MLContext API\n",
+ "- Example 4: Invoke existing SystemML algorithm using scikit-learn/SparkML pipeline like API\n",
+ "- Uninstall/Clean up SystemML Python package and jar file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Install SystemML Python package and jar file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "!pip install ~/git/incubator-systemml/target/systemml-1.0.0-incubating-SNAPSHOT-python.tgz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "!pip show systemml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import SystemML API "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from systemml import MLContext, dml, dmlFromResource\n",
+ "\n",
+ "ml = MLContext(sc)\n",
+ "\n",
+ "print \"Spark Version:\", sc.version\n",
+ "print \"SystemML Version:\", ml.version()\n",
+ "print \"SystemML Built-Time:\", ml.buildTime()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ml.execute(dml(\"\"\"s = 'Hello World!'\"\"\").output(\"s\")).get(\"s\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import numpy, sklearn, and define some helper functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import sys, os, glob, subprocess\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from sklearn import datasets\n",
+ "plt.switch_backend('agg')\n",
+ " \n",
+ "def printLastLogLines(n):\n",
+ " fname = max(glob.iglob(os.sep.join([os.environ[\"HOME\"],'/logs/notebook/kernel-pyspark-*.log'])), key=os.path.getctime)\n",
+ " print(subprocess.check_output(['tail', '-' + str(n), fname]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "import sys, os\n",
+ "SCRIPTS = os.sep.join([os.environ[\"HOME\"],'anaconda', 'lib', 'python' + sys.version[:3], 'site-packages', 'systemml', 'systemml-java', 'scripts'])\n",
+ "print SCRIPTS"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Example 1: Matrix Multiplication"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SystemML script to generate a random matrix, perform matrix multiplication, and compute the sum of the output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "slideshow": {
+ "slide_type": "-"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "script = \"\"\"\n",
+ " X = rand(rows=$nr, cols=1000, sparsity=0.5)\n",
+ " A = t(X) %*% X\n",
+ " s = sum(A)\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prog = dml(script).input('$nr', 1e5).output('s')\n",
+ "s = ml.execute(prog).get('s')\n",
+ "print s"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Examine execution plans, and increase data size to observe changed execution plans"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "ml = MLContext(sc)\n",
+ "ml = ml.setStatistics(True)\n",
+ "# re-execute ML program"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "printLastLogLines(22)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prog = dml(script).input('$nr', 1e6).output('s')\n",
+ "out = ml.execute(prog).get('s')\n",
+ "print out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "ml = MLContext(sc)\n",
+ "ml = ml.setStatistics(False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Load diabetes dataset from scikit-learn "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "diabetes = datasets.load_diabetes()\n",
+ "diabetes_X = diabetes.data[:, np.newaxis, 2]\n",
+ "diabetes_X_train = diabetes_X[:-20]\n",
+ "diabetes_X_test = diabetes_X[-20:]\n",
+ "diabetes_y_train = diabetes.target[:-20].reshape(-1,1)\n",
+ "diabetes_y_test = diabetes.target[-20:].reshape(-1,1)\n",
+ "\n",
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "diabetes.data.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Example 2: Implement three different algorithms to train linear regression model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "## Algorithm 1: Linear Regression - Direct Solve (no regularization) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Least squares formulation\n",
+ "$w^* = \\arg\\min_w \\lVert Xw - y \\rVert^2 = \\arg\\min_w (y - Xw)'(y - Xw) = \\arg\\min_w \\frac{1}{2} w'(X'X)w - w'(X'y)$\n",
+ "\n",
+ "#### Setting the gradient\n",
+ "Setting $dw = (X'X)w - (X'y)$ to 0 gives $w = (X'X)^{-1}(X'y)$ = `solve(X'X, X'y)`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "script = \"\"\"\n",
+ " # add constant feature to X to model intercept\n",
+ " X = cbind(X, matrix(1, rows=nrow(X), cols=1))\n",
+ " A = t(X) %*% X\n",
+ " b = t(X) %*% y\n",
+ " w = solve(A, b)\n",
+ " bias = as.scalar(w[nrow(w),1])\n",
+ " w = w[1:nrow(w)-1,]\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true,
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w', 'bias')\n",
+ "w, bias = ml.execute(prog).get('w','bias')\n",
+ "w = w.toNumPy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')\n",
+ "\n",
+ "plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='blue', linestyle ='dotted')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true
+ },
+ "source": [
+ "## Algorithm 2: Linear Regression - Batch Gradient Descent (no regularization)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Algorithm\n",
+ "`Step 1: Start with an initial point \n",
+ "while(not converged) { \n",
+ " Step 2: Compute gradient dw. \n",
+ " Step 3: Compute stepsize alpha. \n",
+ " Step 4: Update: wnew = wold + alpha*dw \n",
+ "}`\n",
+ "\n",
+ "#### Gradient formula\n",
+ "`dw = r = (X'X)w - (X'y)`\n",
+ "\n",
+ "#### Step size formula\n",
+ "`Find number alpha to minimize f(w + alpha*r) \n",
+ "alpha = -(r'r)/(r'X'Xr)`\n",
+ "\n",
+ "![Gradient Descent](http://blog.datumbox.com/wp-content/uploads/2013/10/gradient-descent.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "script = \"\"\"\n",
+ " # add constant feature to X to model intercepts\n",
+ " X = cbind(X, matrix(1, rows=nrow(X), cols=1))\n",
+ " max_iter = 100\n",
+ " w = matrix(0, rows=ncol(X), cols=1)\n",
+ " for(i in 1:max_iter){\n",
+ " XtX = t(X) %*% X\n",
+ " dw = XtX %*%w - t(X) %*% y\n",
+ " alpha = -(t(dw) %*% dw) / (t(dw) %*% XtX %*% dw)\n",
+ " w = w + dw*alpha\n",
+ " }\n",
+ " bias = as.scalar(w[nrow(w),1])\n",
+ " w = w[1:nrow(w)-1,] \n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w').output('bias')\n",
+ "w, bias = ml.execute(prog).get('w', 'bias')\n",
+ "w = w.toNumPy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')\n",
+ "\n",
+ "plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='red', linestyle ='dashed')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Algorithm 3: Linear Regression - Conjugate Gradient (no regularization)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Problem with gradient descent: Takes very similar directions many times\n",
+ "\n",
+ "Solution: Enforce conjugacy\n",
+ "\n",
+ "`Step 1: Start with an initial point \n",
+ "while(not converged) {\n",
+ " Step 2: Compute gradient dw.\n",
+ " Step 3: Compute stepsize alpha.\n",
+ " Step 4: Compute next direction p by enforcing conjugacy with previous direction.\n",
+ " Step 5: Update: w_new = w_old + alpha*p\n",
+ "}`\n",
+ "\n",
+ "![Gradient Descent vs Conjugate Gradient](http://i.stack.imgur.com/zh1HH.png)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "script = \"\"\"\n",
+ " # add constant feature to X to model intercepts\n",
+ " X = cbind(X, matrix(1, rows=nrow(X), cols=1))\n",
+ " m = ncol(X); i = 1; \n",
+ " max_iter = 20;\n",
+ " w = matrix (0, rows = m, cols = 1); # initialize weights to 0\n",
+ " dw = - t(X) %*% y; p = - dw; # dw = (X'X)w - (X'y)\n",
+ " norm_r2 = sum (dw ^ 2); \n",
+ " for(i in 1:max_iter) {\n",
+ " q = t(X) %*% (X %*% p)\n",
+ " alpha = norm_r2 / sum (p * q); # Minimizes f(w - alpha*r)\n",
+ " w = w + alpha * p; # update weights\n",
+ " dw = dw + alpha * q; \n",
+ " old_norm_r2 = norm_r2; norm_r2 = sum (dw ^ 2);\n",
+ " p = -dw + (norm_r2 / old_norm_r2) * p; # next direction - conjugacy to previous direction\n",
+ " i = i + 1;\n",
+ " }\n",
+ " bias = as.scalar(w[nrow(w),1])\n",
+ " w = w[1:nrow(w)-1,] \n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w').output('bias')\n",
+ "w, bias = ml.execute(prog).get('w','bias')\n",
+ "w = w.toNumPy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')\n",
+ "\n",
+ "plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='red', linestyle ='dashed')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Example 3: Invoke existing SystemML algorithm script LinearRegDS.dml using MLContext API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "prog = dml(\"/Users/user_name/git/incubator-systemml/scripts/algorithms/LinearRegDS.dml\").input(X=diabetes_X_train, y=diabetes_y_train).input('$icpt',1.0).output('beta_out')\n",
+ "w = ml.execute(prog).get('beta_out')\n",
+ "w = w.toNumPy()\n",
+ "bias=w[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')\n",
+ "\n",
+ "plt.plot(diabetes_X_test, (w[0]*diabetes_X_test)+bias, color='red', linestyle ='dashed')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Example 4: Invoke existing SystemML algorithm using scikit-learn/SparkML pipeline like API"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The *mllearn* API allows a Python programmer to invoke SystemML's algorithms using a scikit-learn-like API as well as Spark's MLPipeline API."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from pyspark.sql import SQLContext\n",
+ "from systemml.mllearn import LinearRegression\n",
+ "sqlCtx = SQLContext(sc)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "regr = LinearRegression(sqlCtx)\n",
+ "# Train the model using the training sets\n",
+ "regr.fit(diabetes_X_train, diabetes_y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "predictions = regr.predict(diabetes_X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use the trained model to perform prediction\n",
+ "%matplotlib inline\n",
+ "plt.scatter(diabetes_X_train, diabetes_y_train, color='black')\n",
+ "plt.scatter(diabetes_X_test, diabetes_y_test, color='red')\n",
+ "\n",
+ "plt.plot(diabetes_X_test, predictions, color='black')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Uninstall/Clean up SystemML Python package and jar file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!yes | pip uninstall systemml"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}