You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2016/09/02 00:21:04 UTC
incubator-systemml git commit: [SYSTEMML-878] Update the Python
package from SystemML to systemml
Repository: incubator-systemml
Updated Branches:
refs/heads/gh-pages c08740043 -> 298a9e7b9
[SYSTEMML-878] Update the Python package from SystemML to systemml
- Updated Python package name from SystemML to systemml
- Moved uploadToPyPI.sh script to dev/release
- Updated the documentation
Closes #231.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/298a9e7b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/298a9e7b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/298a9e7b
Branch: refs/heads/gh-pages
Commit: 298a9e7b979296310d8ed5826e6224f77eb96941
Parents: c087400
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 1 16:55:57 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 1 16:55:57 2016 -0700
----------------------------------------------------------------------
algorithms-classification.md | 18 +++++++-------
algorithms-regression.md | 8 +++----
beginners-guide-python.md | 49 +++++++++++++++++++++++++++++----------
3 files changed, 50 insertions(+), 25 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/298a9e7b/algorithms-classification.md
----------------------------------------------------------------------
diff --git a/algorithms-classification.md b/algorithms-classification.md
index 340267c..8d19d04 100644
--- a/algorithms-classification.md
+++ b/algorithms-classification.md
@@ -129,7 +129,7 @@ Eqs. (1) and (2).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
# C = 1/reg
logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -237,7 +237,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -253,7 +253,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -498,7 +498,7 @@ support vector machine (`y` with domain size `2`).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
# C = 1/reg
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -766,7 +766,7 @@ class labels.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
# C = 1/reg
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -916,7 +916,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -932,7 +932,7 @@ print('LogisticRegression score: %f' % svm.fit(X_train, y_train).score(X_test, y
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -1122,7 +1122,7 @@ applicable when all features are counts of categorical values.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
nb = NaiveBayes(sqlCtx, laplace=1.0)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
y_test = nb.fit(X_train, y_train)
@@ -1257,7 +1257,7 @@ SystemML Language Reference for details.
{% highlight python %}
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
from sklearn import metrics
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/298a9e7b/algorithms-regression.md
----------------------------------------------------------------------
diff --git a/algorithms-regression.md b/algorithms-regression.md
index 6585b00..992862e 100644
--- a/algorithms-regression.md
+++ b/algorithms-regression.md
@@ -82,7 +82,7 @@ efficient when the number of features $m$ is relatively small
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
# C = 1/reg
lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -124,7 +124,7 @@ y_test = lr.fit(df_train)
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
# C = 1/reg
lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices
@@ -222,7 +222,7 @@ SystemML Language Reference for details.
{% highlight python %}
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -277,7 +277,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
{% highlight python %}
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/298a9e7b/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/beginners-guide-python.md b/beginners-guide-python.md
index 3b4aeed..f040212 100644
--- a/beginners-guide-python.md
+++ b/beginners-guide-python.md
@@ -72,7 +72,7 @@ brew install apache-spark
#### Step 1: Install SystemML Python package
```bash
-pip install SystemML
+pip install systemml
```
#### Step 2: Download SystemML Java binaries
@@ -81,14 +81,14 @@ SystemML Python package downloads the corresponding Java binaries (along with al
into the installed location. To find the location of the downloaded Java binaries, use the following command:
```bash
-python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'
+python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'
```
#### Step 3: (Optional but recommended) Set SYSTEMML_HOME environment variable
<div class="codetabs">
<div data-lang="OSX" markdown="1">
```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
# If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
echo '' >> ~/.bashrc
echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -96,7 +96,7 @@ echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
</div>
<div data-lang="Linux" markdown="1">
```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
# If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
echo '' >> ~/.bashrc
echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -128,7 +128,7 @@ pyspark --master local[*] --driver-class-path $SYSTEMML_HOME"/SystemML.jar"
To get started with SystemML, let's try few elementary matrix multiplication operations:
```python
-import SystemML as sml
+import systemml as sml
import numpy as np
sml.setSparkContext(sc)
m1 = sml.matrix(np.ones((3,3)) + 2)
@@ -152,7 +152,7 @@ model: $ \beta = solve(X^T X, X^T y) $. For simplicity, we will use direct-solve
```python
import numpy as np
from sklearn import datasets
-import SystemML as sml
+import systemml as sml
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -196,7 +196,7 @@ algorithm.
```python
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -230,7 +230,7 @@ algorithm on digits datasets.
```python
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -245,15 +245,21 @@ logistic = LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
```
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
### Passing PySpark DataFrame
To train the above algorithm on larger dataset, we can load the dataset into DataFrame and pass it to the `fit` method:
```python
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
-import SystemML as sml
+import systemml as sml
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
X_digits = digits.data
@@ -267,6 +273,12 @@ logistic = LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(df_train).score(X_test, y_test))
```
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
### MLPipeline interface
In the below example, we demonstrate how the same `LogisticRegression` class can allow SystemML to fit seamlessly into
@@ -275,7 +287,7 @@ large data pipelines.
```python
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -307,6 +319,19 @@ prediction = model.transform(test)
prediction.show()
```
+Output:
+
+```bash
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|id| text| words| features| probability| ID|prediction|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|12| spark i j k|ArrayBuffer(spark...|(20,[5,6,7],[2.0,...|[0.99999999999975...|1.0| 1.0|
+|13| l m n|ArrayBuffer(l, m, n)|(20,[8,9,10],[1.0...|[1.37552128844736...|2.0| 2.0|
+|14|mapreduce spark|ArrayBuffer(mapre...|(20,[5,10],[1.0,1...|[0.99860290938153...|3.0| 1.0|
+|15| apache hadoop|ArrayBuffer(apach...|(20,[9,14],[1.0,1...|[5.41688748236143...|4.0| 2.0|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+```
+
## Invoking DML/PyDML scripts using MLContext
The below example demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml)
@@ -315,7 +340,7 @@ using Python [MLContext API](https://apache.github.io/incubator-systemml/spark-m
```python
from sklearn import datasets, neighbors
from pyspark.sql import DataFrame, SQLContext
-import SystemML as sml
+import systemml as sml
import pandas as pd
import os
sqlCtx = SQLContext(sc)