Posted to commits@systemml.apache.org by ni...@apache.org on 2016/08/29 20:53:50 UTC

incubator-systemml git commit: [SYSTEMML-209] Added documentation for MLPipeline Scala wrappers for MultiLogReg, SVM and Naive Bayes.

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 1be911cc5 -> 3877e3563


[SYSTEMML-209] Added documentation for MLPipeline Scala wrappers for
MultiLogReg, SVM and Naive Bayes.
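
The pattern common to all three new wrappers is the standard Spark ML
fit/transform sequence. As a minimal sketch, lifted from the diff below and
assuming a SparkContext sc plus training and test DataFrames X_train_df and
X_test_df are already in scope:

    import org.apache.sysml.api.ml.LogisticRegression
    // Construct the estimator with a name and the SparkContext,
    // fit it on a DataFrame, then score a test DataFrame.
    val lr = new LogisticRegression("logReg", sc)
    val model = lr.fit(X_train_df)
    val prediction = model.transform(X_test_df)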

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3877e356
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3877e356
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3877e356

Branch: refs/heads/master
Commit: 3877e3563007ebf383d8c18681ab3d379a93f698
Parents: 1be911c
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Mon Aug 29 13:47:16 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Mon Aug 29 13:48:42 2016 -0700

----------------------------------------------------------------------
 docs/algorithms-classification.md | 108 +++++++++++++++++++++++++++++++++
 docs/beginners-guide-python.md    |   7 +--
 2 files changed, 111 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3877e356/docs/algorithms-classification.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md
index 03c78d6..340267c 100644
--- a/docs/algorithms-classification.md
+++ b/docs/algorithms-classification.md
@@ -138,6 +138,14 @@ y_test = logistic.fit(X_train, y_train).predict(X_test)
 y_test = logistic.fit(df_train).transform(df_test)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.sysml.api.ml.LogisticRegression
+val lr = new LogisticRegression("logReg", sc).setIcpt(0).setMaxOuterIter(100).setMaxInnerIter(0).setRegParam(0.000001).setTol(0.000001)
+val model = lr.fit(X_train_df)
+val prediction = model.transform(X_test_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f MultiLogReg.dml
                             -nvargs X=<file>
@@ -277,6 +285,38 @@ prediction = model.transform(test)
 prediction.show()
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.sysml.api.ml.LogisticRegression
+import org.apache.spark.ml.Pipeline
+val training = sqlContext.createDataFrame(Seq(
+    ("a b c d e spark", 1.0),
+    ("b d", 2.0),
+    ("spark f g h", 1.0),
+    ("hadoop mapreduce", 2.0),
+    ("b spark who", 1.0),
+    ("g d a y", 2.0),
+    ("spark fly", 1.0),
+    ("was mapreduce", 2.0),
+    ("e spark program", 1.0),
+    ("a e c l", 2.0),
+    ("spark compile", 1.0),
+    ("hadoop software", 2.0))).toDF("text", "label")
+val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
+val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
+val lr = new LogisticRegression("logReg", sc)
+val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
+val model = pipeline.fit(training)
+val test = sqlContext.createDataFrame(Seq(
+    ("spark i j k", 1.0),
+    ("l m n", 2.0),
+    ("mapreduce spark", 1.0),
+    ("apache hadoop", 2.0))).toDF("text", "trueLabel")
+val prediction = model.transform(test)
+prediction.show()
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f MultiLogReg.dml
                             -nvargs X=/user/ml/X.mtx
@@ -467,6 +507,13 @@ y_test = svm.fit(X_train, y_train)
 y_test = svm.fit(df_train)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.sysml.api.ml.SVM
+val svm = new SVM("svm", sc, isMultiClass=false).setIcpt(0).setMaxIter(100).setRegParam(0.000001).setTol(0.000001)
+val model = svm.fit(X_train_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f l2-svm.dml
                             -nvargs X=<file>
@@ -510,6 +557,11 @@ y_test = svm.predict(X_test)
 y_test = svm.transform(df_test)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+val prediction = model.transform(X_test_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f l2-svm-predict.dml
                             -nvargs X=<file>
@@ -723,6 +775,13 @@ y_test = svm.fit(X_train, y_train)
 y_test = svm.fit(df_train)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.sysml.api.ml.SVM
+val svm = new SVM("svm", sc, isMultiClass=true).setIcpt(0).setMaxIter(100).setRegParam(0.000001).setTol(0.000001)
+val model = svm.fit(X_train_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f m-svm.dml
                             -nvargs X=<file>
@@ -766,6 +825,11 @@ y_test = svm.predict(X_test)
 y_test = svm.transform(df_test)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+val prediction = model.transform(X_test_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f m-svm-predict.dml
                             -nvargs X=<file>
@@ -900,6 +964,38 @@ prediction = model.transform(test)
 prediction.show()
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.sysml.api.ml.SVM
+import org.apache.spark.ml.Pipeline
+val training = sqlContext.createDataFrame(Seq(
+    ("a b c d e spark", 1.0),
+    ("b d", 2.0),
+    ("spark f g h", 1.0),
+    ("hadoop mapreduce", 2.0),
+    ("b spark who", 1.0),
+    ("g d a y", 2.0),
+    ("spark fly", 1.0),
+    ("was mapreduce", 2.0),
+    ("e spark program", 1.0),
+    ("a e c l", 2.0),
+    ("spark compile", 1.0),
+    ("hadoop software", 2.0))).toDF("text", "label")
+val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
+val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
+val svm = new SVM("svm", sc, isMultiClass=true)
+val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, svm))
+val model = pipeline.fit(training)
+val test = sqlContext.createDataFrame(Seq(
+    ("spark i j k", 1.0),
+    ("l m n", 2.0),
+    ("mapreduce spark", 1.0),
+    ("apache hadoop", 2.0))).toDF("text", "trueLabel")
+val prediction = model.transform(test)
+prediction.show()
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f m-svm.dml
                             -nvargs X=/user/ml/X.mtx
@@ -1034,6 +1130,13 @@ y_test = nb.fit(X_train, y_train)
 y_test = nb.fit(df_train)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+import org.apache.sysml.api.ml.NaiveBayes
+val nb = new NaiveBayes("naiveBayes", sc, isMultiClass=true).setLaplace(1.0)
+val model = nb.fit(X_train_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f naive-bayes.dml
                             -nvargs X=<file>
@@ -1073,6 +1176,11 @@ y_test = nb.predict(X_test)
 y_test = nb.transform(df_test)
 {% endhighlight %}
 </div>
+<div data-lang="Scala" markdown="1">
+{% highlight scala %}
+val prediction = model.transform(X_test_df)
+{% endhighlight %}
+</div>
 <div data-lang="Hadoop" markdown="1">
     hadoop jar SystemML.jar -f naive-bayes-predict.dml
                             -nvargs X=<file>
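
Because the new wrappers are ordinary Spark ML Estimators, they should also
compose with stock Spark tooling such as cross-validation. The sketch below is
illustrative and not part of this commit: it reuses the text-classification
pipeline from the diff above, assumes the wrapper emits Spark's standard
prediction column for the evaluator, and varies only hashingTF.numFeatures (a
stock Spark ML Param), since grid search over the SystemML-specific setters is
not documented here.

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
    import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
    import org.apache.sysml.api.ml.SVM

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol(tokenizer.getOutputCol).setOutputCol("features")
    val svm = new SVM("svm", sc, isMultiClass=true)
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, svm))
    // Try two hashing dimensions; 3-fold cross-validation picks the better one.
    val grid = new ParamGridBuilder().addGrid(hashingTF.numFeatures, Array(10, 20)).build()
    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new MulticlassClassificationEvaluator().setLabelCol("label"))
      .setEstimatorParamMaps(grid)
      .setNumFolds(3)
    val cvModel = cv.fit(training)  // "training" is the DataFrame built in the diff above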

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3877e356/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index 790ed43..b565656 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -309,11 +309,11 @@ prediction.show()
 
 ## Invoking DML/PyDML scripts using MLContext
 
-TODO: This is work in progress.
+The example below demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml)
+using the Python [MLContext API](https://apache.github.io/incubator-systemml/spark-mlcontext-programming-guide).
 
 ```python
 from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
 from pyspark.sql import DataFrame, SQLContext
 import SystemML as sml
 import pandas as pd
@@ -328,7 +328,6 @@ X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:.9 * n_samples]))
 y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:.9 * n_samples]))
 ml = sml.MLContext(sc)
 script = os.path.join(os.environ['SYSTEMML_HOME'], 'scripts', 'algorithms', 'MultiLogReg.dml')
-script = sml.dml(script).input(X=X_df, Y_vec=y_df).out("B_out")
-# .input($X=' ', $Y=' ', $B=' ')
+script = sml.dml(script).input(X=X_df, Y_vec=y_df).input(**{"$X": ' ', "$Y": ' ', "$B": ' '}).out("B_out")
 beta = ml.execute(script).getNumPyArray('B_out')
 ```
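
For completeness, a Scala counterpart of the above can be sketched with the
MLContext API linked in the new text. This is not part of the commit; it
assumes scriptPath, X_df and y_df are prepared analogously to the Python
example, and placeholder bindings for the script's $X/$Y/$B command-line
defaults may be needed just as in the Python version:

    import org.apache.sysml.api.mlcontext.MLContext
    import org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile

    val ml = new MLContext(sc)
    // Bind the input DataFrames to the script's X and Y_vec variables,
    // and request the coefficient matrix B_out back.
    val script = dmlFromFile(scriptPath).in("X", X_df).in("Y_vec", y_df).out("B_out")
    val results = ml.execute(script)
    val B_out = results.getDataFrame("B_out")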