Posted to commits@systemml.apache.org by du...@apache.org on 2016/08/05 22:33:14 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-849][SYSTEMML-457][SYSTEMML-458] Clean Up and Reorganize Documentation Targeted At Data Scientists

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 588bafac3 -> 77363c0c6


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/spark-mlcontext-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/spark-mlcontext-programming-guide.md b/docs/spark-mlcontext-programming-guide.md
index 2eaf1be..6c2d2af 100644
--- a/docs/spark-mlcontext-programming-guide.md
+++ b/docs/spark-mlcontext-programming-guide.md
@@ -108,7 +108,7 @@ ml.execute(helloScript)
 <div data-lang="Spark Shell" markdown="1">
 {% highlight scala %}
 scala> val helloScript = dml("print('hello world')")
-helloScript: org.apache.sysml.api.mlcontext.Script = 
+helloScript: org.apache.sysml.api.mlcontext.Script =
 Inputs:
 None
 
@@ -117,7 +117,7 @@ None
 
 scala> ml.execute(helloScript)
 hello world
-res0: org.apache.sysml.api.mlcontext.MLResults = 
+res0: org.apache.sysml.api.mlcontext.MLResults =
 None
 
 {% endhighlight %}
@@ -214,7 +214,7 @@ scala> val minMaxMean =
      | maxOut = max(Xin)
      | meanOut = mean(Xin)
      | """
-minMaxMean: String = 
+minMaxMean: String =
 "
 minOut = min(Xin)
 maxOut = max(Xin)
@@ -307,7 +307,7 @@ scala> val sums = """
      |   message = "s1 and s2 are equal"
      | }
      | """
-sums: String = 
+sums: String =
 "
 s1 = sum(m1);
 s2 = sum(m2);
@@ -323,7 +323,7 @@ if (s1 > s2) {
 scala> scala.tools.nsc.io.File("sums.dml").writeAll(sums)
 
 scala> val sumScript = dmlFromFile("sums.dml").in(Map("m1"-> rdd1, "m2"-> rdd2)).out("s1", "s2", "message")
-sumScript: org.apache.sysml.api.mlcontext.Script = 
+sumScript: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38
   [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38
@@ -334,7 +334,7 @@ Outputs:
   [3] message
 
 scala> val sumResults = ml.execute(sumScript)
-sumResults: org.apache.sysml.api.mlcontext.MLResults = 
+sumResults: org.apache.sysml.api.mlcontext.MLResults =
   [1] (Double) s1: 10.0
   [2] (Double) s2: 26.0
   [3] (String) message: s2 is greater
@@ -378,7 +378,7 @@ scala> val rdd2Metadata = new MatrixMetadata(2, 2)
 rdd2Metadata: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 2, columns: 2, non-zeros: None, rows per block: None, columns per block: None
 
 scala> val sumScript = dmlFromFile("sums.dml").in(Seq(("m1", rdd1, rdd1Metadata), ("m2", rdd2, rdd2Metadata))).out("s1", "s2", "message")
-sumScript: org.apache.sysml.api.mlcontext.Script = 
+sumScript: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38
   [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38
@@ -416,7 +416,7 @@ val (firstSum, secondSum, sumMessage) = ml.execute(sumScript).getTuple[Double, D
 <div data-lang="Spark Shell" markdown="1">
 {% highlight scala %}
 scala> val sumScript = dmlFromFile("sums.dml").in("m1", rdd1, rdd1Metadata).in("m2", rdd2, rdd2Metadata).out("s1").out("s2").out("message")
-sumScript: org.apache.sysml.api.mlcontext.Script = 
+sumScript: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38
   [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38
@@ -445,7 +445,7 @@ Let's look at an example of reading a matrix out of SystemML. We'll create a DML
 in which we create a 2x2 matrix `m`. We'll set the variable `n` to be the sum of the cells in the matrix.
 
 We create a script object using String `s`, and we set `m` and `n` as the outputs. We execute the script, and in
-the results we see we have Matrix `m` and Double `n`. The `n` output variable has a value of `110.0`. 
+the results we see we have Matrix `m` and Double `n`. The `n` output variable has a value of `110.0`.
 
 We get Matrix `m` and Double `n` as a Tuple of values `x` and `y`. We then convert Matrix `m` to an
 RDD of IJV values, an RDD of CSV values, a DataFrame, and a two-dimensional Double Array, and we display
@@ -478,14 +478,14 @@ scala> val s =
      | m = matrix("11 22 33 44", rows=2, cols=2)
      | n = sum(m)
      | """
-s: String = 
+s: String =
 "
 m = matrix("11 22 33 44", rows=2, cols=2)
 n = sum(m)
 "
 
 scala> val scr = dml(s).out("m", "n");
-scr: org.apache.sysml.api.mlcontext.Script = 
+scr: org.apache.sysml.api.mlcontext.Script =
 Inputs:
 None
 
@@ -495,7 +495,7 @@ Outputs:
 
 
 scala> val res = ml.execute(scr)
-res: org.apache.sysml.api.mlcontext.MLResults = 
+res: org.apache.sysml.api.mlcontext.MLResults =
   [1] (Matrix) m: Matrix: scratch_space//_p12059_9.31.117.12//_t0/temp26_14, [2 x 2, nnz=4, blocks (1000 x 1000)], binaryblock, dirty
   [2] (Double) n: 110.0
 
@@ -588,7 +588,7 @@ scala> val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-syste
 scriptUrl: String = https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml
 
 scala> val uni = dmlFromUrl(scriptUrl).in("A", habermanRDD, habermanMetadata).in("K", typesRDD, typesMetadata).in("$CONSOLE_OUTPUT", true)
-uni: org.apache.sysml.api.mlcontext.Script = 
+uni: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) A: ParallelCollectionRDD[159] at parallelize at <console>:43
   [2] (RDD) K: ParallelCollectionRDD[160] at parallelize at <console>:39
@@ -653,7 +653,7 @@ Feature [4]: Categorical (Nominal)
  (15) Num of categories   | 2
  (16) Mode                | 1
  (17) Num of modes        | 1
-res23: org.apache.sysml.api.mlcontext.MLResults = 
+res23: org.apache.sysml.api.mlcontext.MLResults =
 None
 
 {% endhighlight %}
@@ -723,7 +723,7 @@ baseStats.asRDDStringIJV.collect.slice(0,9).foreach(println)
 <div data-lang="Spark Shell" markdown="1">
 {% highlight scala %}
 scala> val uni = dmlFromUrl(scriptUrl).in("A", habermanRDD, habermanMetadata).in("K", typesRDD, typesMetadata).out("baseStats")
-uni: org.apache.sysml.api.mlcontext.Script = 
+uni: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) A: ParallelCollectionRDD[159] at parallelize at <console>:43
   [2] (RDD) K: ParallelCollectionRDD[160] at parallelize at <console>:39
@@ -783,7 +783,7 @@ scala> val minMaxMean =
      | maxOut = max(Xin)
      | meanOut = mean(Xin)
      | """
-minMaxMean: String = 
+minMaxMean: String =
 "
 minOut = min(Xin)
 maxOut = max(Xin)
@@ -937,7 +937,7 @@ scala> val minMaxMean =
      | maxOut = max(Xin)
      | meanOut = mean(Xin)
      | """
-minMaxMean: String = 
+minMaxMean: String =
 "
 minOut = min(Xin)
 maxOut = max(Xin)
@@ -1023,7 +1023,7 @@ scala> val minMaxMean =
      | maxOut = max(Xin)
      | meanOut = mean(Xin)
      | """
-minMaxMean: String = 
+minMaxMean: String =
 "
 minOut = min(Xin)
 maxOut = max(Xin)
@@ -1186,7 +1186,7 @@ scala> class MyScriptExecutor extends org.apache.sysml.api.mlcontext.ScriptExecu
 defined class MyScriptExecutor
 
 scala> val helloScript = dml("print('hello world')")
-helloScript: org.apache.sysml.api.mlcontext.Script = 
+helloScript: org.apache.sysml.api.mlcontext.Script =
 Inputs:
 None
 
@@ -1197,7 +1197,7 @@ scala> ml.execute(helloScript, new MyScriptExecutor)
 Parsing script
 Validating script
 hello world
-res63: org.apache.sysml.api.mlcontext.MLResults = 
+res63: org.apache.sysml.api.mlcontext.MLResults =
 None
 
 {% endhighlight %}
@@ -1242,7 +1242,7 @@ scala> val rddCSV = sc.parallelize(Array("1.0,2.0", "3.0,4.0"))
 rddCSV: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[190] at parallelize at <console>:38
 
 scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddCSV).out("sum", "mean")
-sumAndMean: org.apache.sysml.api.mlcontext.Script = 
+sumAndMean: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m: ParallelCollectionRDD[190] at parallelize at <console>:38
 
@@ -1251,7 +1251,7 @@ Outputs:
   [2] mean
 
 scala> ml.execute(sumAndMean)
-res20: org.apache.sysml.api.mlcontext.MLResults = 
+res20: org.apache.sysml.api.mlcontext.MLResults =
   [1] (Double) sum: 10.0
   [2] (Double) mean: 2.5
 
@@ -1291,7 +1291,7 @@ scala> val mm3x3 = new MatrixMetadata(MatrixFormat.IJV, 3, 3)
 mm3x3: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 3, columns: 3, non-zeros: None, rows per block: None, columns per block: None
 
 scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddIJV, mm3x3).out("sum", "mean")
-sumAndMean: org.apache.sysml.api.mlcontext.Script = 
+sumAndMean: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m: ParallelCollectionRDD[202] at parallelize at <console>:38
 
@@ -1300,7 +1300,7 @@ Outputs:
   [2] mean
 
 scala> ml.execute(sumAndMean)
-res21: org.apache.sysml.api.mlcontext.MLResults = 
+res21: org.apache.sysml.api.mlcontext.MLResults =
   [1] (Double) sum: 10.0
   [2] (Double) mean: 1.1111111111111112
 
@@ -1333,7 +1333,7 @@ scala> val mm4x4 = new MatrixMetadata(MatrixFormat.IJV, 4, 4)
 mm4x4: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 4, columns: 4, non-zeros: None, rows per block: None, columns per block: None
 
 scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddIJV, mm4x4).out("sum", "mean")
-sumAndMean: org.apache.sysml.api.mlcontext.Script = 
+sumAndMean: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) m: ParallelCollectionRDD[210] at parallelize at <console>:38
 
@@ -1342,7 +1342,7 @@ Outputs:
   [2] mean
 
 scala> ml.execute(sumAndMean)
-res22: org.apache.sysml.api.mlcontext.MLResults = 
+res22: org.apache.sysml.api.mlcontext.MLResults =
   [1] (Double) sum: 10.0
   [2] (Double) mean: 0.625
 
@@ -1445,7 +1445,7 @@ scala> val rddCSV = sc.parallelize(Array("1.0,2.0", "3.0,4.0"))
 rddCSV: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[341] at parallelize at <console>:53
 
 scala> val add = dml("y = x + 1").in("x", rddCSV).out("y")
-add: org.apache.sysml.api.mlcontext.Script = 
+add: org.apache.sysml.api.mlcontext.Script =
 Inputs:
   [1] (RDD) x: ParallelCollectionRDD[341] at parallelize at <console>:53
 
@@ -2100,3 +2100,8 @@ plt.title('PNMF Training Loss')
 {% endhighlight %}
 
 ![Jupyter Loss Graph](img/spark-mlcontext-programming-guide/jupyter_loss_graph.png "Jupyter Loss Graph")
+
+# Recommended Spark Configuration Settings
+
+For best performance, we recommend setting the following flags when running SystemML with Spark:
+`--conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128`.
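+
+For example, when launching the Spark shell for use with SystemML, these flags
+could be supplied at startup (a sketch; the SystemML jar name and path are
+illustrative):
+
+    spark-shell --jars SystemML.jar \
+                --conf spark.driver.maxResultSize=0 \
+                --conf spark.akka.frameSize=128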

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/standalone-guide.md
----------------------------------------------------------------------
diff --git a/docs/standalone-guide.md b/docs/standalone-guide.md
new file mode 100644
index 0000000..38b6497
--- /dev/null
+++ b/docs/standalone-guide.md
@@ -0,0 +1,582 @@
+---
+layout: global
+title: SystemML Standalone Guide
+description: SystemML Standalone Guide
+displayTitle: SystemML Standalone Guide
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+This tutorial provides a quick introduction to using SystemML by
+running existing SystemML algorithms in standalone mode.
+
+
+# What is SystemML
+
+SystemML enables large-scale machine learning (ML) via high-level declarative
+languages: [DML](dml-language-reference.html), which has R-like syntax, and
+PyDML, which has Python-like syntax. DML and PyDML allow data scientists to
+express their ML algorithms with full flexibility but without the need to fine-tune
+distributed runtime execution plans and system configurations.
+These ML programs are dynamically compiled and optimized based on data
+and cluster characteristics using rule-based and cost-based optimization techniques.
+The compiler automatically generates hybrid runtime execution plans ranging
+from in-memory, single-node execution to distributed batch computation on Hadoop
+or Spark.
+SystemML features a suite of algorithms for Descriptive Statistics, Classification,
+Clustering, Regression, Matrix Factorization, and Survival Analysis. Detailed descriptions of these
+algorithms can be found in the [Algorithms Reference](algorithms-reference.html).
+
+# Download SystemML
+
+Apache incubator releases of SystemML are available from the [downloads](http://systemml.apache.org/download.html) page.
+
+The SystemML project is available on GitHub at [https://github.com/apache/incubator-systemml](https://github.com/apache/incubator-systemml).
+SystemML can be downloaded from GitHub and built with Maven. Instructions to build and
+test SystemML can be found in the [SystemML GitHub README](https://github.com/apache/incubator-systemml).
+
+# Standalone vs Distributed Execution Mode
+
+SystemML's standalone mode is designed to allow data scientists to rapidly prototype algorithms
+on a single machine. In standalone mode, all operations occur on a single node in a non-Hadoop
+environment. Standalone mode is not appropriate for large datasets.
+
+For large-scale production environments, SystemML algorithm execution can be
+distributed across multi-node clusters using [Apache Hadoop](https://hadoop.apache.org/)
+or [Apache Spark](http://spark.apache.org/).
+We will make use of standalone mode throughout this tutorial.
+
+# Choosing Test Data
+
+In this tutorial we will use the [Haberman's Survival Data Set](http://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival)
+which can be downloaded in CSV format from the [Center for Machine Learning and Intelligent Systems](http://cml.ics.uci.edu/):
+
+    $ wget -P data/ http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data
+
+The [Haberman Data Set](http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.names)
+has 306 instances and 4 attributes (including the class attribute):
+
+ 1. Age of patient at time of operation (numerical)
+ 2. Patient's year of operation (year - 1900, numerical)
+ 3. Number of positive axillary nodes detected (numerical)
+ 4. Survival status (class attribute)
+   * `1` = the patient survived 5 years or longer
+   * `2` = the patient died within 5 years
+
+
+We will need to create a metadata (MTD) file that describes the content of the
+data file. The name of the MTD file associated with the
+data file `<filename>` must be `<filename>.mtd`.
+
+    $ echo '{"rows": 306, "cols": 4, "format": "csv"}' > data/haberman.data.mtd
+
+<br/>
+
+# Example 1 - Univariate Statistics
+
+Let's start with a simple example, computing certain [univariate statistics](algorithms-descriptive-statistics.html#univariate-statistics)
+for each feature column using the algorithm `Univar-Stats.dml` which requires 3
+[arguments](algorithms-descriptive-statistics.html#arguments):
+
+* `X`:  location of the input data file to analyze
+* `TYPES`:  location of the file that contains the feature column types encoded by integer numbers: `1` = scale, `2` = nominal, `3` = ordinal
+* `STATS`:  location where the output matrix of computed statistics is to be stored
+
+We need to create a file `types.csv` that describes the type of each column in
+the data along with its metadata file `types.csv.mtd`.
+
+    $ echo '1,1,1,2' > data/types.csv
+    $ echo '{"rows": 1, "cols": 4, "format": "csv"}' > data/types.csv.mtd
+
+
+To run the `Univar-Stats.dml` algorithm, issue the following command (we set the optional argument `CONSOLE_OUTPUT` to `TRUE` to print the statistics to the console):
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
+
+    [...]
+    -------------------------------------------------
+    Feature [1]: Scale
+     (01) Minimum             | 30.0
+     (02) Maximum             | 83.0
+     (03) Range               | 53.0
+     (04) Mean                | 52.45751633986928
+     (05) Variance            | 116.71458266366658
+     (06) Std deviation       | 10.803452349303281
+     (07) Std err of mean     | 0.6175922641866753
+     (08) Coeff of variation  | 0.20594669940735139
+     (09) Skewness            | 0.1450718616532357
+     (10) Kurtosis            | -0.6150152487211726
+     (11) Std err of skewness | 0.13934809593495995
+     (12) Std err of kurtosis | 0.277810485320835
+     (13) Median              | 52.0
+     (14) Interquartile mean  | 52.16013071895425
+    -------------------------------------------------
+    Feature [2]: Scale
+     (01) Minimum             | 58.0
+     (02) Maximum             | 69.0
+     (03) Range               | 11.0
+     (04) Mean                | 62.85294117647059
+     (05) Variance            | 10.558630665380907
+     (06) Std deviation       | 3.2494046632238507
+     (07) Std err of mean     | 0.18575610076612029
+     (08) Coeff of variation  | 0.051698529971741194
+     (09) Skewness            | 0.07798443581479181
+     (10) Kurtosis            | -1.1324380182967442
+     (11) Std err of skewness | 0.13934809593495995
+     (12) Std err of kurtosis | 0.277810485320835
+     (13) Median              | 63.0
+     (14) Interquartile mean  | 62.80392156862745
+    -------------------------------------------------
+    Feature [3]: Scale
+     (01) Minimum             | 0.0
+     (02) Maximum             | 52.0
+     (03) Range               | 52.0
+     (04) Mean                | 4.026143790849673
+     (05) Variance            | 51.691117539912135
+     (06) Std deviation       | 7.189653506248555
+     (07) Std err of mean     | 0.41100513466216837
+     (08) Coeff of variation  | 1.7857418611299172
+     (09) Skewness            | 2.954633471088322
+     (10) Kurtosis            | 11.425776549251449
+     (11) Std err of skewness | 0.13934809593495995
+     (12) Std err of kurtosis | 0.277810485320835
+     (13) Median              | 1.0
+     (14) Interquartile mean  | 1.2483660130718954
+    -------------------------------------------------
+    Feature [4]: Categorical (Nominal)
+     (15) Num of categories   | 2
+     (16) Mode                | 1
+     (17) Num of modes        | 1
+
+
+The `Univar-Stats.dml` script writes the computed statistics to the file
+`univarOut.mtx` in text IJV format. Logically, the output matrix has one row per
+univariate statistic and one column per input feature; in the file, the first
+column gives the index of the statistic (see the numbered list above), the second
+column gives the index of the feature column in the input data, and the third
+column gives the value of the univariate statistic.
+
+    1 1 30.0
+    1 2 58.0
+    2 1 83.0
+    2 2 69.0
+    2 3 52.0
+    3 1 53.0
+    3 2 11.0
+    3 3 52.0
+    4 1 52.45751633986928
+    4 2 62.85294117647059
+    4 3 4.026143790849673
+    5 1 116.71458266366658
+    5 2 10.558630665380907
+    5 3 51.691117539912135
+    6 1 10.803452349303281
+    6 2 3.2494046632238507
+    6 3 7.189653506248555
+    7 1 0.6175922641866753
+    7 2 0.18575610076612029
+    7 3 0.41100513466216837
+    8 1 0.20594669940735139
+    8 2 0.051698529971741194
+    8 3 1.7857418611299172
+    9 1 0.1450718616532357
+    9 2 0.07798443581479181
+    9 3 2.954633471088322
+    10 1 -0.6150152487211726
+    10 2 -1.1324380182967442
+    10 3 11.425776549251449
+    11 1 0.13934809593495995
+    11 2 0.13934809593495995
+    11 3 0.13934809593495995
+    12 1 0.277810485320835
+    12 2 0.277810485320835
+    12 3 0.277810485320835
+    13 1 52.0
+    13 2 63.0
+    13 3 1.0
+    14 1 52.16013071895425
+    14 2 62.80392156862745
+    14 3 1.2483660130718954
+    15 4 2.0
+    16 4 1.0
+    17 4 1.0
+
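+Individual statistics can be pulled out of this IJV output with a simple filter on
+the first column. For example (a sketch, assuming the space-separated text output
+shown above), the mean (statistic 4) of every feature column:
+
+    $ awk '$1 == 4' data/univarOut.mtx
+    4 1 52.45751633986928
+    4 2 62.85294117647059
+    4 3 4.026143790849673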
+
+<br/>
+<br/>
+
+# Example 2 - Binary-class Support Vector Machines
+
+Let's take the same `haberman.data` to explore the
+[binary-class support vector machines](algorithms-classification.html#binary-class-support-vector-machines) algorithm `l2-svm.dml`.
+This example also illustrates how to use the sampling algorithm `sample.dml`
+and the data-splitting algorithm `splitXY.dml`.
+
+## Sampling the Test Data
+
+First we use the `sample.dml` algorithm to split the input into a training data set
+and a test data set for model prediction.
+
+Parameters:
+
+ * `X`       : (input)  input data set: filename of input data set
+ * `sv`      : (input)  sampling vector: filename of 1-column vector w/ percentages. sum(sv) must be 1.
+ * `O`       : (output) folder name w/ samples generated
+ * `ofmt`    : (output) format of O: "csv", "binary" (default)
+
+
+We will create the files `perc.csv` and `perc.csv.mtd` to define a sampling vector
+that splits the data 50/50 into two data sets:
+
+    $ printf "0.5\n0.5" > data/perc.csv
+    $ echo '{"rows": 2, "cols": 1, "format": "csv"}' > data/perc.csv.mtd
+
+Let's run the sampling algorithm to create the two data samples:
+
+    $ ./runStandaloneSystemML.sh scripts/utils/sample.dml -nvargs X=data/haberman.data sv=data/perc.csv O=data/haberman.part ofmt="csv"
+
+
+## Splitting Labels from Features
+
+Next we use the `splitXY.dml` algorithm to separate the feature columns from
+the label column(s).
+
+Parameters:
+
+ * `X`       : (input)  filename of data matrix
+ * `y`       : (input)  colIndex: starting index is 1
+ * `OX`      : (output) filename of output matrix with all columns except y
+ * `OY`      : (output) filename of output matrix with y column
+ * `ofmt`    : (output) format of OX and OY output matrix: "csv", "binary" (default)
+
+We specify `y=4` because the 4th column contains the labels to be predicted, and run
+the `splitXY.dml` algorithm on our training and test data sets.
+
+    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/1 y=4 OX=data/haberman.train.data.csv OY=data/haberman.train.labels.csv ofmt="csv"
+
+    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/2 y=4 OX=data/haberman.test.data.csv  OY=data/haberman.test.labels.csv  ofmt="csv"
+
+## Training and Testing the Model
+
+Now we need to train our model using the `l2-svm.dml` algorithm.
+
+[Parameters](algorithms-classification.html#arguments-1):
+
+ * `X`         : (input)  filename of training data features
+ * `Y`         : (input)  filename of training data labels
+ * `model`     : (output) filename of model that contains the learnt weights
+ * `fmt`       : (output) format of model: "csv", "text" (sparse-matrix)
+ * `Log`       : (output) log file for metrics and progress while training
+ * `confusion` : (output) filename of confusion matrix computed using a held-out test set (optional)
+
+The `l2-svm.dml` algorithm is used on our training data sample to train the model.
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm.dml -nvargs X=data/haberman.train.data.csv Y=data/haberman.train.labels.csv model=data/l2-svm-model.csv fmt="csv" Log=data/l2-svm-log.csv
+
+The `l2-svm-predict.dml` algorithm is used on our test data sample to predict the labels based on the trained model.
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm-predict.dml -nvargs X=data/haberman.test.data.csv Y=data/haberman.test.labels.csv model=data/l2-svm-model.csv fmt="csv" confusion=data/l2-svm-confusion.csv
+
+The console output should show the accuracy of the trained model in percent, e.g.:
+
+    15/09/01 01:32:51 INFO api.DMLScript: BEGIN DML run 09/01/2015 01:32:51
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating localtmpdir with value /tmp/systemml
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating scratch with value scratch_space
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating optlevel with value 2
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating numreducers with value 10
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating jvmreuse with value false
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating defaultblocksize with value 1000
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster with value false
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster.mem with value 2048
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.mapreduce.mem with value 2048
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.app.queue with value default
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.matrixmult with value true
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.textio with value true
+    Accuracy (%): 74.14965986394557
+    15/09/01 01:32:52 INFO api.DMLScript: SystemML Statistics:
+    Total execution time:		0.130 sec.
+    Number of executed MR Jobs:	0.
+
+The generated file `l2-svm-confusion.csv` should contain a confusion matrix of this form:
+
+    |0   1.0 2.0|
+    |1.0 t1  t2 |
+    |2.0 t3  t4 |
+
+ * The model correctly predicted label 1 `t1` times
+ * The model incorrectly predicted label 1 as opposed to label 2 `t2` times
+ * The model incorrectly predicted label 2 as opposed to label 1 `t3` times
+ * The model correctly predicted label 2 `t4` times
+
+If the confusion matrix looks like this ...
+
+    0,1.0,2.0
+    1.0,107.0,38.0
+    2.0,0.0,2.0
+
+... then the accuracy of the model is (t1+t4)/(t1+t2+t3+t4) = (107+2)/(107+38+0+2) = 0.741496599
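+
+The accuracy can also be computed directly from the generated CSV file (a sketch,
+assuming the comma-separated layout shown above):
+
+    $ awk -F',' 'NR==2 {t1=$2; t2=$3} NR==3 {t3=$2; t4=$3} END {print "Accuracy:", (t1+t4)/(t1+t2+t3+t4)}' data/l2-svm-confusion.csv
+    Accuracy: 0.741497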
+
+<br/>
+
+Refer to the [Algorithms Reference](algorithms-reference.html) for more details.
+
+<br/>
+
+# Example 3 - Linear Regression
+
+For this example, we'll use a standalone wrapper executable, `bin/systemml`, that is available to
+be run directly within the project's source directory when built locally.
+
+After you build SystemML from source (`mvn clean package`), the standalone mode can be executed
+either on Linux or OS X using the `./bin/systemml` script, or on Windows using the
+`.\bin\systemml.bat` batch file.
+
+If you run the script from the project root folder `./` or from the `./bin` folder,
+then the output files from running SystemML will be created inside the `./temp`
+folder to keep them separate from the SystemML source files managed by Git. The
+output files for this example will therefore be created under the `./temp` folder.
+
+The runtime behavior and logging behavior of SystemML can be customized by editing the files
+`./conf/SystemML-config.xml` and `./conf/log4j.properties`. Both files will be created from their
+corresponding `*.template` files during the first execution of the SystemML executable script.
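+
+For instance, more detailed SystemML logging could be enabled by raising the log
+level in `./conf/log4j.properties` (a sketch; `DEBUG` is just one of the standard
+log4j levels, and the line can also be edited in place rather than appended):
+
+    $ echo 'log4j.logger.org.apache.sysml=DEBUG' >> conf/log4j.properties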
+
+When invoking `./bin/systemml` or `.\bin\systemml.bat` with any of the prepackaged DML scripts,
+you can omit the relative path to the DML script file. The following two commands are equivalent:
+
+    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+    ./bin/systemml genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+In this guide we invoke the commands with the relative path to make it easier to look up the source
+of the DML scripts.
+
+## Linear Regression Example
+
+As an example of the capabilities and power of SystemML and DML, let's consider the Linear Regression algorithm.
+We require sets of data to train and test our model. To obtain this data, we can either use real data or
+generate data for our algorithm. The
+[UCI Machine Learning Repository Datasets](https://archive.ics.uci.edu/ml/datasets.html) is one location for real data.
+Use of real data typically involves some degree of data wrangling. In the following example, we will use SystemML to
+generate random data to train and test our model.
+
+This example consists of the following parts:
+
+  * [Run DML Script to Generate Random Data](#run-dml-script-to-generate-random-data)
+  * [Divide Generated Data into Two Sample Groups](#divide-generated-data-into-two-sample-groups)
+  * [Split Label Column from First Sample](#split-label-column-from-first-sample)
+  * [Split Label Column from Second Sample](#split-label-column-from-second-sample)
+  * [Train Model on First Sample](#train-model-on-first-sample)
+  * [Test Model on Second Sample](#test-model-on-second-sample)
+
+SystemML is distributed in several packages, including a standalone package. We'll operate in Standalone mode in this
+example.
+
+<a name="run-dml-script-to-generate-random-data" />
+
+### Run DML Script to Generate Random Data
+
+We can execute the `genLinearRegressionData.dml` script in Standalone mode using either the `systemml` or `systemml.bat`
+file.
+In this example, we'll generate a matrix of 1000 rows of 50 columns of test data, with sparsity 0.7. In addition to
+this, a 51<sup>st</sup> column consisting of labels will
+be appended to the matrix.
+
+    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+This generates the following files inside the `./temp` folder:
+
+    linRegData.csv      # 1000 rows of 51 columns of doubles (50 data columns and 1 label column), csv format
+    linRegData.csv.mtd  # Metadata file
+    perc.csv            # Used to generate two subsets of the data (for training and testing)
+    perc.csv.mtd        # Metadata file
+    scratch_space       # SystemML scratch_space directory
+
+<a name="divide-generated-data-into-two-sample-groups" />
+
+### Divide Generated Data into Two Sample Groups
+
+Next, we'll create two subsets of the generated data, each of size ~50%. We can accomplish this using the `sample.dml`
+script with the `perc.csv` file created in the previous step:
+
+    0.5
+    0.5
+
+
+The `sample.dml` script will randomly sample rows from the `linRegData.csv` file and place them into 2 files based
+on the percentages specified in `perc.csv`. This will create two sample groups of roughly 50 percent each.
+
+    ./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
+
+
+This script creates two partitions of the original data and places them in a `linRegDataParts` folder. The files created
+are as follows:
+
+    linRegDataParts/1       # first partition of data, ~50% of rows of linRegData.csv, csv format
+    linRegDataParts/1.mtd   # metadata
+    linRegDataParts/2       # second partition of data, ~50% of rows of linRegData.csv, csv format
+    linRegDataParts/2.mtd   # metadata
+
+
+The `1` file contains the first partition of data, and the `2` file contains the second partition of data.
+An associated metadata file describes
+the nature of each partition of data. If we open `1` and `2` and look at the number of rows, we can see that typically
+the partitions are not exactly 50% but instead are close to 50%. However, we find that the total number of rows in the
+original data file equals the sum of the number of rows in `1` and `2`.
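+
+This is easy to check from the command line, since the partitions are csv files
+with one row per line (paths assume the default `./temp` output folder):
+
+    $ wc -l temp/linRegDataParts/1 temp/linRegDataParts/2
+
+The two counts vary from run to run, but their sum equals the 1000 rows of
+`linRegData.csv`.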
+
+
+<a name="split-label-column-from-first-sample" />
+
+### Split Label Column from First Sample
+
+The next task is to split the label column from the first sample. We can do this using the `splitXY.dml` script.
+
+    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
+
+This splits column 51, the label column, off from the data. When done, the following files have been created.
+
+    linRegData.train.data.csv        # training data of 50 columns, csv format
+    linRegData.train.data.csv.mtd    # metadata
+    linRegData.train.labels.csv      # training labels of 1 column, csv format
+    linRegData.train.labels.csv.mtd  # metadata
+
+
+<a name="split-label-column-from-second-sample" />
+
+### Split Label Column from Second Sample
+
+We also need to split the label column from the second sample.
+
+    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
+
+This splits column 51 off the data, resulting in the following files:
+
+    linRegData.test.data.csv        # test data of 50 columns, csv format
+    linRegData.test.data.csv.mtd    # metadata
+    linRegData.test.labels.csv      # test labels of 1 column, csv format
+    linRegData.test.labels.csv.mtd  # metadata
+
+
+<a name="train-model-on-first-sample" />
+
+### Train Model on First Sample
+
+Now, we can train our model based on the first sample. To do this, we utilize the `LinearRegDS.dml` (Linear Regression
+Direct Solve) script. Note that SystemML also includes a `LinearRegCG.dml` (Linear Regression Conjugate Gradient)
+algorithm for situations where the number of features is large.
+
+    ./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
+
+This will generate the following files:
+
+    betas.csv      # betas, 50 rows of 1 column, csv format
+    betas.csv.mtd  # metadata
+
+The `LinearRegDS.dml` script writes statistics similar to the following to standard output.
+
+	BEGIN LINEAR REGRESSION SCRIPT
+	Reading X and Y...
+	Calling the Direct Solver...
+	Computing the statistics...
+	AVG_TOT_Y,-2.160284487670675
+	STDEV_TOT_Y,66.86434576808432
+	AVG_RES_Y,-3.3127468704080085E-10
+	STDEV_RES_Y,1.7231785003947183E-8
+	DISPERSION,2.963950542926297E-16
+	PLAIN_R2,1.0
+	ADJUSTED_R2,1.0
+	PLAIN_R2_NOBIAS,1.0
+	ADJUSTED_R2_NOBIAS,1.0
+	PLAIN_R2_VS_0,1.0
+	ADJUSTED_R2_VS_0,1.0
+	Writing the output matrix...
+	END LINEAR REGRESSION SCRIPT
+
+Now that we have our `betas.csv`, we can test our model with our second set of data.
+
+
+<a name="test-model-on-second-sample" />
+
+### Test Model on Second Sample
+
+To test our model on the second sample, we can use the `GLM-predict.dml` script. This script can be used for both
+prediction and scoring. Here, we're using it for scoring since we include the `Y` named argument. Our `betas.csv`
+file is specified as the `B` named argument.
+
+    ./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
+
+This generates statistics similar to the following to standard output.
+
+	LOGLHOOD_Z,,FALSE,NaN
+	LOGLHOOD_Z_PVAL,,FALSE,NaN
+	PEARSON_X2,,FALSE,1.895530994504798E-13
+	PEARSON_X2_BY_DF,,FALSE,4.202951207327712E-16
+	PEARSON_X2_PVAL,,FALSE,1.0
+	DEVIANCE_G2,,FALSE,0.0
+	DEVIANCE_G2_BY_DF,,FALSE,0.0
+	DEVIANCE_G2_PVAL,,FALSE,1.0
+	LOGLHOOD_Z,,TRUE,NaN
+	LOGLHOOD_Z_PVAL,,TRUE,NaN
+	PEARSON_X2,,TRUE,1.895530994504798E-13
+	PEARSON_X2_BY_DF,,TRUE,4.202951207327712E-16
+	PEARSON_X2_PVAL,,TRUE,1.0
+	DEVIANCE_G2,,TRUE,0.0
+	DEVIANCE_G2_BY_DF,,TRUE,0.0
+	DEVIANCE_G2_PVAL,,TRUE,1.0
+	AVG_TOT_Y,1,,1.0069397725436522
+	STDEV_TOT_Y,1,,68.29092137526905
+	AVG_RES_Y,1,,-4.1450397073455047E-10
+	STDEV_RES_Y,1,,2.0519206226041048E-8
+	PRED_STDEV_RES,1,TRUE,1.0
+	PLAIN_R2,1,,1.0
+	ADJUSTED_R2,1,,1.0
+	PLAIN_R2_NOBIAS,1,,1.0
+	ADJUSTED_R2_NOBIAS,1,,1.0
+
+
+We see that the STDEV_RES_Y value of the testing phase is of similar magnitude
+to the value obtained from the model training phase.
+
+For convenience, we can encapsulate our DML invocations in a single script:
+
+	#!/bin/bash
+
+	./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+	./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
+
+	./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
+
+	./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
+
+	./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
+
+	./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
+
+
+# Troubleshooting
+
+If you encounter a `java.lang.OutOfMemoryError`, you can edit the invocation
+script (`runStandaloneSystemML.sh` or `runStandaloneSystemML.bat`) to increase
+the memory available to the JVM, e.g.:
+
+    java -Xmx16g -Xms4g -Xmn1g -cp ${CLASSPATH} org.apache.sysml.api.DMLScript \
+         -f ${SCRIPT_FILE} -exec singlenode -config=SystemML-config.xml \
+         $@


[2/2] incubator-systemml git commit: [SYSTEMML-849][SYSTEMML-457][SYSTEMML-458] Clean Up and Reorganize Documentation Targeted At Data Scientists

Posted by du...@apache.org.
[SYSTEMML-849][SYSTEMML-457][SYSTEMML-458] Clean Up and Reorganize Documentation Targeted At Data Scientists

This update cleans up and reorganizes a lot of the existing
documentation. The goal here is to work towards cleaning up our external
message and targeting specific types of users in order to increase ease
of adoption.

My vision is that we target data scientists using Spark first and
foremost. Without this focus in our documentation, the project is
seemingly too confusing, and will deter this key user demographic from
adoption. Once these users are onboard, engine developers will follow.

This PR is a first effort towards this goal, and provides a nicer,
cleaned-up version of the docs. We should collectively work to improve
these, with clear separation between data scientists, systems
engineers/researchers, etc.

Closes #203.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/77363c0c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/77363c0c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/77363c0c

Branch: refs/heads/master
Commit: 77363c0c67b131cbf937b023a353765af3c6e3bb
Parents: 588bafa
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Fri Aug 5 15:31:58 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Fri Aug 5 15:31:58 2016 -0700

----------------------------------------------------------------------
 README.md                                 | 402 ++---------------
 docs/Gemfile                              |   3 -
 docs/_layouts/global.html                 |   2 +-
 docs/developer-tools-systemml.md          |  28 +-
 docs/dml-language-reference.md            | 178 +-------
 docs/engine-dev-guide.md                  |  56 +++
 docs/hadoop-batch-mode.md                 |  71 ++-
 docs/index.md                             |  53 ++-
 docs/quick-start-guide.md                 | 399 -----------------
 docs/release-process.md                   |  42 +-
 docs/spark-batch-mode.md                  |  84 ++++
 docs/spark-mlcontext-programming-guide.md |  59 +--
 docs/standalone-guide.md                  | 582 +++++++++++++++++++++++++
 13 files changed, 910 insertions(+), 1049 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 31bed64..2c73d6e 100644
--- a/README.md
+++ b/README.md
@@ -33,11 +33,11 @@ SystemML is a flexible, scalable machine learning system.
 SystemML's distinguishing characteristics are:
 
   1. **Algorithm customizability via R-like and Python-like languages**.
-  2. **Multiple execution modes**, including Standalone, Spark Batch, Spark MLContext, Hadoop Batch, and JMLC.
+  2. **Multiple execution modes**, including Spark MLContext API, Spark Batch, Hadoop Batch, Standalone, and JMLC.
   3. **Automatic optimization** based on data and cluster characteristics to ensure both efficiency and scalability.
 
 
-### Algorithm Customizability
+## Algorithm Customizability
 
 ML algorithms in SystemML are specified in a high-level, declarative machine learning (DML) language.
 Algorithms can be expressed in either an R-like syntax or a Python-like syntax. DML includes
@@ -49,400 +49,92 @@ analytics and (2) data independence from the underlying input formats and
 physical data representations.
 
 
-### Multiple Execution Modes
+## Multiple Execution Modes
 
 SystemML computations can be executed in a variety of different modes. To begin with, SystemML
 can be operated in Standalone mode on a single machine, allowing data scientists to develop
 algorithms locally without need of a distributed cluster. In order to scale up, algorithms can also be distributed
 across a cluster using Spark or Hadoop.
 This flexibility allows the utilization of an organization's existing resources and expertise.
-In addition, SystemML features a Spark MLContext API that allows for programmatic interaction via Scala and Java.
-SystemML also features an embedded API for scoring models.
+In addition, SystemML features a
+[Spark MLContext API](http://apache.github.io/incubator-systemml/spark-mlcontext-programming-guide.html)
+that allows for programmatic interaction via Scala, Python, and Java. SystemML also features an
+embedded API for scoring models.
 
 
-### Automatic Optimization
+## Automatic Optimization
 
 Algorithms specified in DML are dynamically compiled and optimized based on data and cluster characteristics
 using rule-based and cost-based optimization techniques. The optimizer automatically generates hybrid runtime
 execution plans ranging from in-memory, single-node execution, to distributed computations on Spark or Hadoop.
 This ensures both efficiency and scalability. Automatic optimization reduces or eliminates the need to hand-tune
 distributed runtime execution plans and system configurations.
-* * *
 
-## SystemML Assumptions
+## ML Algorithms
 
-Before you get started on SystemML, make sure that your environment is set up and ready to go.
+SystemML features a suite of production-level examples that can be grouped into six broad categories:
+Descriptive Statistics, Classification, Clustering, Regression, Matrix Factorization, and Survival Analysis.
+Detailed descriptions of these algorithms can be found in the
+[SystemML Algorithms Reference](http://apache.github.io/incubator-systemml/algorithms-reference.html). The goal of these provided algorithms is to serve as production-level examples that can be modified or used as inspiration for a new custom algorithm.
 
-  1. **If you're on a mac, you'll want to install homebrew (http://brew.sh) if you haven't already.**
+## Download & Setup
+
+Before you get started on SystemML, make sure that your environment is set up and ready to go.
 
-  *Copy and paste the following into your terminal.*
+  1. **If you're on OS X, we recommend installing [Homebrew](http://brew.sh) if you haven't already. For Linux users, the [Linuxbrew project](http://linuxbrew.sh/) is equivalent.**
 
+  OS X:
   ```
   /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
   ```
-  2. **Now install Java (need Java 8).**
+  Linux:
+  ```
+  ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
+  ```
+
+  2. **Install Java (need Java 8).**
   ```
   brew tap caskroom/cask
   brew install Caskroom/cask/java
   ```
-  
-  3. **Now install everything else you need**
-
-  *In order to install something on homebrew all you need to do is type "brew install" followed by what you want to install. See below.*
 
-  **Install Spark 1.6**
+  3. **Install Spark.**
   ```
   brew install apache-spark
   ```
-  **Install python**
-  ```
-  brew install python
-  ```
-  **Install jupyter and matplotlib and numpy.**
-  ```
-  pip install jupyter matplotlib numpy
-  ```
-  4. **Now you can install SystemML.**
 
-  Go to http://systemml.apache.org/download.html and click on the systemml-0.10.0-incubating zip (should be 2nd).
-
-  *The next step is optional, but it will make your life a lot easier.*
+  4. **Download SystemML.**
 
-  5. **Set `SPARK_HOME and SYSTEMML_HOME` on your bash profile.**
+  Go to the [SystemML Downloads page](http://systemml.apache.org/download.html), download `systemml-0.10.0-incubating.zip` (it should be the second item listed), and unzip it to a location of your choice.
 
-  *First, use vim to create/edit your bash profile. Not sure what vim is? Check https://www.linux.com/learn/vim-101-beginners-guide-vim.*
+  *The next step is optional, but it will make your life a lot easier.*
 
-  *We are going to insert our file where Spark and SystemML is stored into our bash profile. This will make it easier to access. **Type the following to open your bash profile using vim:***
-  ```
-  vim .bash_profile
-  ```
-  **Now you are in vim. First, type "i" for insert.**
+  5. **[OPTIONAL] Set `SYSTEMML_HOME` in your bash profile.**
+  Add the following to `~/.bash_profile`, replacing `path/to/` with the location of the download in step 4.
   ```
-  i
+  export SYSTEMML_HOME=path/to/systemml-0.10.0-incubating
   ```
-  **Now insert Spark and SystemML.** Note: /Documents is where I saved my Spark and SystemML. Be sure that your file path is accurate.
+  *Make sure to open a new terminal tab so that the changes take effect.*
 
-  ```
-  export SPARK_HOME=/Users/stc/Documents/spark-1.5.1-bin-hadoop2.6
+  6. **[OPTIONAL] Install Python or Python 3 (to follow along with our Jupyter notebook examples).**
 
-  export SYSTEMML_HOME=/Users/stc/Documents/systemml-0.10.0-incubating
+  Python 2:
   ```
-  **Now type :wq to write the file and quit.**
-  ```
-  :wq
+  brew install python
+  pip install jupyter matplotlib numpy
   ```
-  *Make sure to open a new tab in terminal so that you make sure the changes have been made.*
-
-**Congrats! You can now run SystemML!**
-* * *
-
-## Building SystemML
-
-SystemML is built using [Apache Maven](http://maven.apache.org/).
-SystemML will build on Linux, MacOS, or Windows, and requires Maven 3 and Java 7 (or higher).
-To build SystemML, run:
-
-    mvn clean package
-
-To build the SystemML distributions (`.tar.gz`, `.zip`, etc.), run:
-
-    mvn clean package -P distribution
-
-
-* * *
-
-## Testing SystemML
-
-SystemML features a comprehensive set of integration tests. To perform these tests, run:
-
-    mvn verify
-
-Note: these tests require [R](https://www.r-project.org/) to be installed and available as part of the PATH variable on
-the machine on which you are running these tests.
-
-If required, please install the following packages in R:
-
-    install.packages(c("batch", "bitops", "boot", "caTools", "data.table", "doMC", "doSNOW", "ggplot2", "glmnet", "lda", "Matrix", "matrixStats", "moments", "plotrix", "psych", "reshape", "topicmodels", "wordcloud"), dependencies=TRUE)
-
-* * *
-
-## Importing SystemML into IDE
-
-This section describe how to import SystemML source code into an IDE.
-
-### Import SystemML Project to Eclipse
-
-Eclipse IDE include:
-* [Scala IDE](http://scala-ide.org/)
-* Eclipse Juno with scala plug-in
-
- File -> Import -> Maven -> Existing Maven Projects
-
-Please see below how to resolve some compilation issues that might occour after importing the SystemML project:
-
-##### `invalid cross-compiled libraries` error
-Since Scala IDE bundles the latest versions (2.10.5 and 2.11.6 at this point), you need do add one  in Eclipse Preferences -> Scala -> Installations by pointing to the lib/ directory of your Scala 2.10.4 distribution. Once this is done, select all Spark projects and right-click, choose Scala -> Set Scala Installation and point to the 2.10.4 installation. This should clear all errors about invalid cross-compiled libraries. A clean build should succeed now.
-
-##### `incompatation scala version ` error
-Change IDE scala version `project->propertiest->scala compiler -> scala installation`  to  `Fixed scala Installation: 2.10.5`
-
-##### `Not found type * ` error
-Run command `mvn package`, and do `project -> refresh`
-
-##### `maketplace not found ` error for Eclipse Luna
-Except scala IDE pulgin install, please make sure get update from "http://alchim31.free.fr/m2e-scala/update-site" to update maven connector for scala.
-
-### Import SystemML project to IntelliJ
-
- 1. Download IntelliJ and install the Scala plug-in for IntelliJ.
- 2. Go to "File -> Import Project", locate the spark source directory, and select "Maven Project".
- 3. In the Import wizard, it's fine to leave settings at their default. However it is usually useful to enable "Import Maven projects automatically", since changes to the project structure will automatically update the IntelliJ project.
-
-* * *
-
-## Running SystemML in Standalone Mode
-
-SystemML can run in distributed mode as well as in local standalone mode. We'll operate in standalone mode in this
-guide.
-After you build SystemML from source (`mvn clean package`), the standalone mode can be executed either on Linux or OS X
-using the `./bin/systemml` script, or on Windows using the `.\bin\systemml.bat` batch file.
-
-If you run from the script from the project root folder `./` or from the `./bin` folder, then the output files
-from running SystemML will be created inside the `./temp` folder to keep them separate from the SystemML source
-files managed by Git. The output files for all of the examples in this guide will be created under the `./temp`
-folder.
-
-The runtime behavior and logging behavior of SystemML can be customized by editing the files
-`./conf/SystemML-config.xml` and `./conf/log4j.properties`. Both files will be created from their corresponding
-`*.template` files during the first execution of the SystemML executable script.
-
-When invoking the `./bin/systemml` or `.\bin\systemml.bat` with any of the prepackaged DML scripts you can omit
-the relative path to the DML script file. The following two commands are equivalent:
-
-    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
-
-    ./bin/systemml genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
-
-In this guide we invoke the command with the relative folder to make it easier to look up the source of the DML scripts.
-
-
-* * *
-
-## ML Algorithms
-
-SystemML features a suite of algorithms that can be grouped into six broad categories:
-Descriptive Statistics, Classification, Clustering, Regression, Matrix Factorization, and Survival Analysis.
-Detailed descriptions of these algorithms can be found in the SystemML Algorithms Reference.
-
-* * *
-
-## Linear Regression Example
-
-As an example of the capabilities and power of SystemML and DML, let's consider the Linear Regression algorithm.
-We require sets of data to train and test our model. To obtain this data, we can either use real data or
-generate data for our algorithm. The
-[UCI Machine Learning Repository Datasets](https://archive.ics.uci.edu/ml/datasets.html) is one location for real data.
-Use of real data typically involves some degree of data wrangling. In the following example, we will use SystemML to
-generate random data to train and test our model.
-
-This example consists of the following parts:
-
-  * [Run DML Script to Generate Random Data](#run-dml-script-to-generate-random-data)
-  * [Divide Generated Data into Two Sample Groups](#divide-generated-data-into-two-sample-groups)
-  * [Split Label Column from First Sample](#split-label-column-from-first-sample)
-  * [Split Label Column from Second Sample](#split-label-column-from-second-sample)
-  * [Train Model on First Sample](#train-model-on-first-sample)
-  * [Test Model on Second Sample](#test-model-on-second-sample)
-
-SystemML is distributed in several packages, including a standalone package. We'll operate in Standalone mode in this
-example.
-
-<a name="run-dml-script-to-generate-random-data" />
-
-### Run DML Script to Generate Random Data
-
-We can execute the `genLinearRegressionData.dml` script in Standalone mode using either the `systemml` or `systemml.bat`
-file.
-In this example, we'll generate a matrix of 1000 rows of 50 columns of test data, with sparsity 0.7. In addition to
-this, a 51<sup>st</sup> column consisting of labels will
-be appended to the matrix.
-
-    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
-
-This generates the following files inside the `./temp` folder:
-
-    linRegData.csv      # 1000 rows of 51 columns of doubles (50 data columns and 1 label column), csv format
-    linRegData.csv.mtd  # Metadata file
-    perc.csv            # Used to generate two subsets of the data (for training and testing)
-    perc.csv.mtd        # Metadata file
-    scratch_space       # SystemML scratch_space directory
-
-<a name="divide-generated-data-into-two-sample-groups" />
-
-### Divide Generated Data into Two Sample Groups
-
-Next, we'll create two subsets of the generated data, each of size ~50%. We can accomplish this using the `sample.dml`
-script with the `perc.csv` file created in the previous step:
-
-    0.5
-    0.5
-
-
-The `sample.dml` script will randomly sample rows from the `linRegData.csv` file and place them into 2 files based
-on the percentages specified in `perc.csv`. This will create two sample groups of roughly 50 percent each.
-
-    ./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
-
-
-This script creates two partitions of the original data and places them in a `linRegDataParts` folder. The files created
-are as follows:
-
-    linRegDataParts/1       # first partition of data, ~50% of rows of linRegData.csv, csv format
-    linRegDataParts/1.mtd   # metadata
-    linRegDataParts/2       # second partition of data, ~50% of rows of linRegData.csv, csv format
-    linRegDataParts/2.mtd   # metadata
-
-
-The `1` file contains the first partition of data, and the `2` file contains the second partition of data.
-An associated metadata file describes
-the nature of each partition of data. If we open `1` and `2` and look at the number of rows, we can see that typically
-the partitions are not exactly 50% but instead are close to 50%. However, we find that the total number of rows in the
-original data file equals the sum of the number of rows in `1` and `2`.
-
-
-<a name="split-label-column-from-first-sample" />
-
-### Split Label Column from First Sample
-
-The next task is to split the label column from the first sample. We can do this using the `splitXY.dml` script.
-
-    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
-
-This splits column 51, the label column, off from the data. When done, the following files have been created.
-
-    linRegData.train.data.csv        # training data of 50 columns, csv format
-    linRegData.train.data.csv.mtd    # metadata
-    linRegData.train.labels.csv      # training labels of 1 column, csv format
-    linRegData.train.labels.csv.mtd  # metadata
-
-
-<a name="split-label-column-from-second-sample" />
-
-### Split Label Column from Second Sample
-
-We also need to split the label column from the second sample.
-
-    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
-
-This splits column 51 off the data, resulting in the following files:
-
-    linRegData.test.data.csv        # test data of 50 columns, csv format
-    linRegData.test.data.csv.mtd    # metadata
-    linRegData.test.labels.csv      # test labels of 1 column, csv format
-    linRegData.test.labels.csv.mtd  # metadata
-
-
-<a name="train-model-on-first-sample" />
-
-### Train Model on First Sample
-
-Now, we can train our model based on the first sample. To do this, we utilize the `LinearRegDS.dml` (Linear Regression
-Direct Solve) script. Note that SystemML also includes a `LinearRegCG.dml` (Linear Regression Conjugate Gradient)
-algorithm for situations where the number of features is large.
-
-    ./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
-
-This will generate the following files:
-
-    betas.csv      # betas, 50 rows of 1 column, csv format
-    betas.csv.mtd  # metadata
-
-The LinearRegDS.dml script generates statistics to standard output similar to the following.
-
-	BEGIN LINEAR REGRESSION SCRIPT
-	Reading X and Y...
-	Calling the Direct Solver...
-	Computing the statistics...
-	AVG_TOT_Y,-2.160284487670675
-	STDEV_TOT_Y,66.86434576808432
-	AVG_RES_Y,-3.3127468704080085E-10
-	STDEV_RES_Y,1.7231785003947183E-8
-	DISPERSION,2.963950542926297E-16
-	PLAIN_R2,1.0
-	ADJUSTED_R2,1.0
-	PLAIN_R2_NOBIAS,1.0
-	ADJUSTED_R2_NOBIAS,1.0
-	PLAIN_R2_VS_0,1.0
-	ADJUSTED_R2_VS_0,1.0
-	Writing the output matrix...
-	END LINEAR REGRESSION SCRIPT
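-
-As an aside, if the number of features were much larger, the conjugate-gradient variant mentioned above could be invoked in the same way (a sketch, assuming `LinearRegCG.dml` accepts the same named arguments; see the Algorithms Reference for its full parameter list):
-
-    ./bin/systemml ./scripts/algorithms/LinearRegCG.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv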
-
-Now that we have our `betas.csv`, we can test our model with our second set of data.
-
-
-<a name="test-model-on-second-sample" />
-
-### Test Model on Second Sample
-
-To test our model on the second sample, we can use the `GLM-predict.dml` script. This script can be used for both
-prediction and scoring. Here, we're using it for scoring since we include the `Y` named argument. Our `betas.csv`
-file is specified as the `B` named argument.
-
-    ./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
-
-This generates statistics similar to the following on standard output.
-
-	LOGLHOOD_Z,,FALSE,NaN
-	LOGLHOOD_Z_PVAL,,FALSE,NaN
-	PEARSON_X2,,FALSE,1.895530994504798E-13
-	PEARSON_X2_BY_DF,,FALSE,4.202951207327712E-16
-	PEARSON_X2_PVAL,,FALSE,1.0
-	DEVIANCE_G2,,FALSE,0.0
-	DEVIANCE_G2_BY_DF,,FALSE,0.0
-	DEVIANCE_G2_PVAL,,FALSE,1.0
-	LOGLHOOD_Z,,TRUE,NaN
-	LOGLHOOD_Z_PVAL,,TRUE,NaN
-	PEARSON_X2,,TRUE,1.895530994504798E-13
-	PEARSON_X2_BY_DF,,TRUE,4.202951207327712E-16
-	PEARSON_X2_PVAL,,TRUE,1.0
-	DEVIANCE_G2,,TRUE,0.0
-	DEVIANCE_G2_BY_DF,,TRUE,0.0
-	DEVIANCE_G2_PVAL,,TRUE,1.0
-	AVG_TOT_Y,1,,1.0069397725436522
-	STDEV_TOT_Y,1,,68.29092137526905
-	AVG_RES_Y,1,,-4.1450397073455047E-10
-	STDEV_RES_Y,1,,2.0519206226041048E-8
-	PRED_STDEV_RES,1,TRUE,1.0
-	PLAIN_R2,1,,1.0
-	ADJUSTED_R2,1,,1.0
-	PLAIN_R2_NOBIAS,1,,1.0
-	ADJUSTED_R2_NOBIAS,1,,1.0
-
-
-We see that the STDEV_RES_Y value of the testing phase is of similar magnitude
-to the value obtained from the model training phase.
-
-For convenience, we can encapsulate our DML invocations in a single script:
-
-	#!/bin/bash
-
-	./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
-
-	./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
-
-	./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
-
-	./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
-
-	./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
-
-	./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
 
+  Python 3:
+  ```
+  brew install python3
+  pip3 install jupyter matplotlib numpy
+  ```
 
-* * *
+**Congrats! You can now use SystemML!**
 
-## Conclusion and Next Steps
+## Next Steps!
 
-In this example, we've seen a small part of the capabilities of SystemML. For more detailed information, please
-consult the [Apache SystemML (incubating)](http://systemml.apache.org/) website and the
-[SystemML Documentation](http://apache.github.io/incubator-systemml/) website on GitHub.
+To get started, please consult the
+[SystemML Documentation](http://apache.github.io/incubator-systemml/) website on GitHub.  We
+recommend using the [Spark MLContext API](http://apache.github.io/incubator-systemml/spark-mlcontext-programming-guide.html)
+to run SystemML from Scala or Python using `spark-shell`, `pyspark`, or `spark-submit`.
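+
+For instance, a minimal MLContext session from the Spark shell might look like the following (a sketch; the jar location and memory flags depend on your environment):
+
+    spark-shell --jars SystemML.jar
+
+    scala> import org.apache.sysml.api.mlcontext._
+    scala> import org.apache.sysml.api.mlcontext.ScriptFactory._
+    scala> val ml = new MLContext(sc)
+    scala> ml.execute(dml("print('hello world')"))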

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/Gemfile
----------------------------------------------------------------------
diff --git a/docs/Gemfile b/docs/Gemfile
deleted file mode 100644
index b5d5550..0000000
--- a/docs/Gemfile
+++ /dev/null
@@ -1,3 +0,0 @@
-source "https://rubygems.org"
-gem "jekyll"
-gem "rouge"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/_layouts/global.html
----------------------------------------------------------------------
diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html
index f2c74bd..3527421 100644
--- a/docs/_layouts/global.html
+++ b/docs/_layouts/global.html
@@ -44,7 +44,7 @@
                             <ul class="dropdown-menu" role="menu">
                                 <li><b>Running SystemML:</b></li>
                                 <li><a href="https://github.com/apache/incubator-systemml">SystemML GitHub README</a></li>
-                                <li><a href="quick-start-guide.html">Quick Start Guide</a></li>
+                                <li><a href="standalone-guide.html">Quick Start Guide</a></li>
                                 <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext Programming Guide</a></li>
                                 <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a>
                                 <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/developer-tools-systemml.md
----------------------------------------------------------------------
diff --git a/docs/developer-tools-systemml.md b/docs/developer-tools-systemml.md
index d874393..44069cb 100644
--- a/docs/developer-tools-systemml.md
+++ b/docs/developer-tools-systemml.md
@@ -28,6 +28,15 @@ Useful Tools for Developing SystemML:
 * This will become a table of contents (this text will be scraped).
 {:toc}
 
+## IntelliJ
+
+IntelliJ can be used since it provides great support for mixed Java and Scala projects as described [here](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IntelliJ).
+
+### Import SystemML project to IntelliJ
+
+ 1. Download IntelliJ and install the Scala plug-in for IntelliJ.
+ 2. Go to "File -> Import Project", locate the SystemML source directory, and select "Maven Project".
+ 3. In the Import wizard, it's fine to leave settings at their default. However, it is usually useful to enable "Import Maven projects automatically", since changes to the project structure will automatically update the IntelliJ project.
 
 ## Eclipse
 
@@ -40,7 +49,7 @@ To get started in Eclipse, import SystemML's pom.xml file as an existing Maven p
 ![Eclipse Details](img/developer-tools/eclipse-details.png "Eclipse Details")
 
 
-## Eclipse with Scala
+### Eclipse with Scala
 
 An additional Maven connector is required for working with Scala code in Eclipse.  The [Maven Integration for Scala IDE](http://scala-ide.org/docs/tutorials/m2eclipse/) plugin can be installed into Eclipse from [this](http://alchim31.free.fr/m2e-scala/update-site/) update site.  
 
@@ -59,7 +68,7 @@ Note the corresponding Eclipse project needs to include the Scala nature.  Typic
 ![Add Scala Nature](img/developer-tools/scala-nature.png "Add Scala Nature")
 
 
-## Eclipse Java Only (How to skip Scala)
+### Eclipse Java Only (How to skip Scala)
 
 Since the core SystemML code is written in Java, developers may prefer not to use Eclipse in a mixed Java/Scala environment.  To configure Eclipse to skip the Scala code of SystemML and avoid installing any Scala-related components, Maven lifecycle mappings can be created.  The simplest way to create these mappings is to use Eclipse's quick fix option to resolve the following pom.xml errors which occur if Maven Integration for Scala is not present.
 
@@ -69,7 +78,18 @@ Since the core SystemML code is written in Java, developers may prefer not to us
 
 The lifecycle mappings are stored in a workspace metadata file as specified in Eclipse's Maven Lifecycle Mappings Preferences page.  The pom.xml file itself is unchanged which allows the Scala portion to be built outside of Eclipse using mvn command line.
 
+## Troubleshooting
 
-## IntelliJ
+Below are ways to resolve some compilation issues that might occur after importing the SystemML project:
+
+##### `invalid cross-compiled libraries` error
+Since Scala IDE bundles the latest Scala versions (2.10.5 and 2.11.6 at this point), you need to add one in Eclipse Preferences -> Scala -> Installations by pointing to the lib/ directory of your Scala 2.10.4 distribution. Once this is done, select all SystemML projects, right-click, choose Scala -> Set Scala Installation, and point to the 2.10.4 installation. This should clear all errors about invalid cross-compiled libraries. A clean build should succeed now.
+
+##### `incompatible scala version` error
+Change the IDE Scala version via `Project -> Properties -> Scala Compiler -> Scala Installation` to `Fixed Scala Installation: 2.10.5`.
+
+##### `Not found type *` error
+Run `mvn package`, then refresh the project (`Project -> Refresh`).
 
-IntelliJ can also be used since it provides good support for mixed Java and Scala projects as described [here](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-IntelliJ).
+##### `marketplace not found` error for Eclipse Luna
+In addition to installing the Scala IDE plugin, make sure to install the Maven connector for Scala from the update site "http://alchim31.free.fr/m2e-scala/update-site".

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/dml-language-reference.md
----------------------------------------------------------------------
diff --git a/docs/dml-language-reference.md b/docs/dml-language-reference.md
index ee16085..82a7b24 100644
--- a/docs/dml-language-reference.md
+++ b/docs/dml-language-reference.md
@@ -61,15 +61,16 @@ limitations under the License.
 
 ## Introduction
 
-SystemML compiles scripts written in Declarative Machine Learning (or DML for short) into MapReduce jobs. DML's syntax closely follows R, thereby minimizing the learning curve to use SystemML. Before getting into detail, let's start with a simple Hello World program in DML. Assuming that Hadoop is installed on your machine or cluster, place SystemML.jar and SystemML-config.xml into your directory. Now, create a text file "hello.dml" containing following code:
+SystemML compiles scripts written in Declarative Machine Learning (or DML for short) into mixed driver and distributed jobs. DML's syntax closely follows R, thereby minimizing the learning curve to use SystemML. Before getting into detail, let's start with a simple Hello World program in DML. Assuming that Spark is installed on your machine or cluster, place `SystemML.jar` into your working directory. Now, create a text file `hello.dml` containing the following code:
 
     print("Hello World");
 
 To run this program on your machine, use following command:
 
-    hadoop jar SystemML.jar -f hello.dml
+    spark-submit SystemML.jar -f hello.dml
 
-The option `-f` in the above command refers to the path to the DML script. The detailed list of the options is given in the section "Invocation of SystemML".
+The option `-f` in the above command refers to the path to the DML script. A detailed list of the
+available options can be found by running `spark-submit SystemML.jar -help`.
 
 
 ## Variables
@@ -547,13 +548,13 @@ In above script, `ifdef(\$nbrRows, 10)` function is a short-hand for "`ifdef(\$n
 
 Let's assume that the above script is invoked using the following command-line values:
 
-    hadoop jar SystemML.jar -f test.dml -nvargs fname=test.mtx nbrRows=5 nbrCols=5
+    spark-submit SystemML.jar -f test.dml -nvargs fname=test.mtx nbrRows=5 nbrCols=5
 
 In this case, the script will create a random matrix M with 5 rows and 5 columns and write it to the file "test.mtx" in csv format. After that, it will print the message "Done creating and writing random matrix in test.mtx" on the standard output.
 
 If, however, the above script is invoked from the command line omitting the `nbrRows` argument:
 
-    hadoop jar SystemML.jar -f test.dml -nvargs fname=test.mtx nbrCols=5
+    spark-submit SystemML.jar -f test.dml -nvargs fname=test.mtx nbrCols=5
 
 Then, the script will instead create a random matrix M with 10 rows (i.e. default value provided in the script) and 5 columns.
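 
 For reference, a script with the behavior described above might look like the following (an illustrative sketch, not the exact listing shown earlier in this section):
 
     M = rand(rows=ifdef($nbrRows, 10), cols=$nbrCols)
     write(M, $fname, format="csv")
     print("Done creating and writing random matrix in " + $fname)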
 
@@ -1539,170 +1540,3 @@ All reserved keywords are case-sensitive.
     String
     TRUE
     while
-
-
-## Invocation of SystemML
-
-To execute a DML script, SystemML is invoked as follows:
-
-    hadoop jar SystemML.jar [-? | -help | -f] <filename> (-config=<config_filename>)? (-args | -nvargs)? <args-list>?
-
-Where
-
-`-f <filename>: will be interpreted as a path to file with DML script. <filename> prefixed with hdfs or gpfs is assumed path in DFS, otherwise <filename> treated as path on local file system`
-
-`-debug: (optional) run in debug mode`
-
-`-config=<config_filename>: (optional) use config file located at specified path <config_filename>. <config_filename> prefixed with hdfs or gpfs is assumed path in DFS, otherwise <config_filename> treated as path on local file system (default value for <config_filename> is ./SystemML-config.xml)`
-
-`-args <args-list>: (optional) parameterize DML script with contents of <args-list>, which is ALL args after -args flag. Each argument must be an unnamed-argument, where 1st value after -args will replace \$1 in DML script, 2nd value will replace \$2 in DML script, and so on.`
-
-`-nvargs <args-list>: (optional) parameterize DML script with contents of <args-list>, which is ALL args after -nvargs flag. Each argument must be named-argument of form name=value, where value will replace \$name in DML script.`
-
-`-?, or -help: show this help.`
-
-NOTE: Please refer to section on Command-line Arguments for more details and restrictions on usage of command-line arguments to DML script using `-args <args-list> and -nvargs <args-list>`.
-
-
-### Examples
-
-Run a script in local file foo.dml:
-
-    hadoop jar SystemML.jar -f foo.dml
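-
-To illustrate positional arguments (a hypothetical invocation; the values after `-args` replace `\$1` and `\$2` in the script):
-
-    hadoop jar SystemML.jar -f foo.dml -args input.mtx 5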
-
-An example debug session:
-
-First, you need to call SystemML using the -debug flag.
-
-    hadoop jar SystemML.jar -f test.dml -debug
-
-You can see the line numbers in your DML script with the "list" (or simply "l") command:
-
-    (SystemMLdb) l
-    line    1: A = matrix("1 2 3 4 5 6", rows=3, cols=2)
-    line    2:
-    line    3: B = cumsum(A)
-    line    4: #print(B)
-    line    5: print(sum(B))
-
-The next step is usually to set a breakpoint where we need to analyze the state of our variables:
-
-    (SystemMLdb) b 5
-
-    Breakpoint added at .defaultNS::main, line 5.
-
-Now that we have set a breakpoint, we can start running our DML script:
-
-    (SystemMLdb) r
-    Breakpoint reached at .defaultNS::main instID 15: (line 5).
-    (SystemMLdb) p B
-    1.0000  2.0000
-    4.0000  6.0000
-    9.0000  12.0000
-
-
-## MLContext API
-
-The MLContext API allows users to pass RDDs as input/output to SystemML through Java, Scala, or Python.
-
-Typical usage for MLContext using Spark's Scala Shell is as follows:
-
-    scala> import org.apache.sysml.api.MLContext
-
-Create an input DataFrame and RDDs from files and potentially perform some feature transformation
-
-    scala> val W = sqlContext.load(...)
-    scala> val H = sc.textFile("V.csv")
-    scala> val V = sc.textFile("V.text")
-
-Create MLContext
-
-    scala> val ml = new MLContext(sc)
-
-Register input and output DataFrame/RDD
-
-Supported formats are:
-
-  1. DataFrame
-  2. CSV/Text (as JavaRDD&lt;String&gt; or JavaPairRDD&lt;LongWritable, Text&gt;)
-  3. Binary blocked RDD (JavaPairRDD&lt;MatrixIndexes,MatrixBlock&gt;))
-
-These methods are also overloaded to accept metadata information such as
-format, rlen, clen, etc.
-
-Please note that the variable names given below in quotes correspond to
-variables in the DML script, and these variables need corresponding
-read/write statements in that script.
-
-Currently, only matrix variables are supported through the
-registerInput/registerOutput interface. To pass scalar variables, use
-named/positional arguments (described later) or wrap them in a matrix
-variable.
-
-    scala> ml.registerInput("V", V)
-    scala> ml.registerInput("W", W, "csv")
-    scala> ml.registerInput("H", H, "text", 50, 1500)
-    scala> ml.registerOutput("H")
-    scala> ml.registerOutput("W")
-
-As DataFrame is internally converted to CSV format, one can skip
-providing dimensions.
-
-Call script with default arguments:
-
-    scala> val outputs = ml.execute("GNMF.dml")
-
-MLContext also supports calling a script with positional arguments (args)
-and named arguments (nargs):
-
-    scala> val args = Array("V.mtx", "W.mtx", "H.mtx", "2000", "1500", "50", "1", "WOut.mtx", "HOut.mtx")
-    scala> val nargs = Map("maxIter"->"1")
-    scala> val outputs = ml.execute("GNMF.dml", args) // or ml.execute("GNMF.dml", nargs)
-
-We can then fetch the output RDDs in SystemML's binary blocked format or
-as a DataFrame.
-
-    scala> val HOut = outputs.getDF(sqlContext, "H")
-    scala> val WOut = outputs.getBinaryBlockedRDD(sqlContext, "W")
-
-To register new inputs/outputs and to re-execute the script, it is
-recommended that you first reset the MLContext:
-
-    scala> ml.reset()
-    scala> ml.registerInput("V", newV)
-
-Though it is possible to re-run the script using different (or even the
-same) arguments, doing so with the same registered inputs/outputs without
-a reset is discouraged, because the symbol table entries would have been
-updated since the last invocation:
-
-    scala> val new_outputs = ml.execute("GNMF.dml", new_args)
-
-The Python MLContext API is similar to the Scala/Java MLContext API. Here
-is an example:
-
-    >>> from pyspark.sql import SQLContext
-    >>> from SystemML import MLContext
-    >>> sqlContext = SQLContext(sc)
-    >>> H = sqlContext.jsonFile("H.json")
-    >>> V = sqlContext.jsonFile("V.json")
-    >>> W = sqlContext.jsonFile("W.json")
-    >>> ml = MLContext(sc)
-    >>> ml.registerInput("V", V)
-    >>> ml.registerInput("W", W)
-    >>> ml.registerInput("H", H)
-    >>> ml.registerOutput("H")
-    >>> ml.registerOutput("W")
-    >>> outputs = ml.execute("GNMF.dml")
-
-Note:
-
--   The current version does not allow users to create multiple
-    MLContexts and only allows one thread to execute a DML script using
-    the created MLContext.
--   Even though the above example shows usage through the Scala/Python
-    shells, it works with spark-submit and pyspark script submission as well.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/engine-dev-guide.md
----------------------------------------------------------------------
diff --git a/docs/engine-dev-guide.md b/docs/engine-dev-guide.md
new file mode 100644
index 0000000..634846a
--- /dev/null
+++ b/docs/engine-dev-guide.md
@@ -0,0 +1,56 @@
+---
+layout: global
+displayTitle: SystemML Engine Developer Guide
+title: SystemML Engine Developer Guide
+description: SystemML Engine Developer Guide
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+## Building SystemML
+
+SystemML is built using [Apache Maven](http://maven.apache.org/).
+SystemML will build on Linux, MacOS, or Windows, and requires Maven 3 and Java 7 (or higher).
+To build SystemML, run:
+
+    mvn clean package
+
+To build the SystemML distributions (`.tar.gz`, `.zip`, etc.), run:
+
+    mvn clean package -P distribution
+
+
+* * *
+
+## Testing SystemML
+
+SystemML features a comprehensive set of integration tests. To perform these tests, run:
+
+    mvn verify
+
+Note: these tests require [R](https://www.r-project.org/) to be installed and available on the PATH of
+the machine on which you are running these tests.
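+
+A quick way to confirm that R is visible on the PATH (a simple sanity check):
+
+    Rscript --version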
+
+If required, please install the following packages in R:
+
+    install.packages(c("batch", "bitops", "boot", "caTools", "data.table", "doMC", "doSNOW", "ggplot2", "glmnet", "lda", "Matrix", "matrixStats", "moments", "plotrix", "psych", "reshape", "topicmodels", "wordcloud"), dependencies=TRUE)
+
+* * *

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/hadoop-batch-mode.md
----------------------------------------------------------------------
diff --git a/docs/hadoop-batch-mode.md b/docs/hadoop-batch-mode.md
index 96b32a8..ddc1c1f 100644
--- a/docs/hadoop-batch-mode.md
+++ b/docs/hadoop-batch-mode.md
@@ -47,9 +47,7 @@ refer to the Hadoop documentation.
 
 # Hadoop Batch Mode Invocation Syntax
 
-As described in the [Invocation of SystemML](dml-language-reference.html#invocation-of-systemml) section
-of the [DML Language Reference](dml-language-reference.html), SystemML can be invoked in Hadoop Batch mode using
-the following syntax:
+SystemML can be invoked in Hadoop Batch mode using the following syntax:
 
     hadoop jar SystemML.jar [-? | -help | -f <filename>] (-config=<config_filename>) ([-args | -nvargs] <args-list>)
 
@@ -110,14 +108,14 @@ that I unpacked. I updated the `PATH` variable to include the `JAVA_HOME` `bin`
 and the `HADOOP_HOME` `sbin` directory.
 
 	[hadoop@host1 ~]# vi .bash_profile
-	
+
 	...
 	export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
 	export HADOOP_HOME=/home/hadoop/hadoop-2.6.2
 	PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
 	export PATH
 	...
-	
+
 	[hadoop@host1 ~]$ source ~/.bash_profile
 
 To verify that Java and Hadoop were on the path, I used the `java -version` and `hadoop version` commands.
@@ -167,7 +165,7 @@ arguments to the DML script were specified following the `-nvargs` option.
 	15/11/11 15:56:22 INFO api.DMLScript: SystemML Statistics:
 	Total execution time:		0.288 sec.
 	Number of executed MR Jobs:	0.
-	
+
 	15/11/11 15:56:22 INFO api.DMLScript: END DML run 11/11/2015 15:56:22
 
 In the console output, we see a warning that no default SystemML config file was found in the current working directory.
@@ -207,7 +205,7 @@ To clean things up, I'll delete the files that were generated.
 Next, we'll look at running SystemML with Hadoop in Pseudo-Distributed mode. In Pseudo-Distributed mode, each Hadoop daemon
 (such as NameNode and DataNode) runs in a separate Java process on a single machine.
 
-In the previous section about Hadoop Standalone mode, we set up the `JAVA_HOME` and `HADOOP_HOME` environment variables 
+In the previous section about Hadoop Standalone mode, we set up the `JAVA_HOME` and `HADOOP_HOME` environment variables
 and added `JAVA_HOME/bin`, `HADOOP_HOME/bin`, and `HADOOP_HOME/sbin` to the `PATH` in `.bash_profile`.
 
 We also need to set the `JAVA_HOME` value in the `hadoop-env.sh` file in the Hadoop configuration directory (`etc/hadoop`).
@@ -215,7 +213,7 @@ We also need to set the `JAVA_HOME` value in the `hadoop-env.sh` file in the Had
 	[hadoop@host1 hadoop]$ pwd
 	/home/hadoop/hadoop-2.6.2/etc/hadoop
 	[hadoop@host1 hadoop]$ vi hadoop-env.sh
-	
+
 	...
 	export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
 	...
@@ -247,8 +245,8 @@ a password.
 In the Hadoop configuration directory (`etc/hadoop`), in the `core-site.xml` file, we specify the `fs.defaultFS`
 property to be `localhost` with port `9000`.
 
-	[hadoop@host1 hadoop]$ vi core-site.xml 
-	
+	[hadoop@host1 hadoop]$ vi core-site.xml
+
 	...
 	<configuration>
 	    <property>
@@ -261,8 +259,8 @@ property to be `localhost` with port `9000`.
 By default, HDFS replicates data on three nodes. Since we're running on a single machine, we'll change this to one.
 We'll add a `dfs.replication` property to `hdfs-site.xml` and set its value to `1`.
 
-	[hadoop@host1 hadoop]$ vi hdfs-site.xml 
-	
+	[hadoop@host1 hadoop]$ vi hdfs-site.xml
+
 	...
 	<configuration>
 	    <property>
@@ -275,7 +273,7 @@ We'll add a `dfs.replication` property to `hdfs-site.xml` and set its value to `
 Next, we'll format HDFS.
 
 	[hadoop@host1 ~]$ hdfs namenode -format
-	15/11/11 17:23:33 INFO namenode.NameNode: STARTUP_MSG: 
+	15/11/11 17:23:33 INFO namenode.NameNode: STARTUP_MSG:
 	/************************************************************
 	STARTUP_MSG: Starting NameNode
 	STARTUP_MSG:   host = host1.example.com/9.30.252.15
@@ -342,7 +340,7 @@ Let's go ahead and execute the `genLinearRegressionData.dml` script in Hadoop Ps
 	15/11/11 18:16:35 INFO api.DMLScript: SystemML Statistics:
 	Total execution time:		1.484 sec.
 	Number of executed MR Jobs:	0.
-	
+
 	15/11/11 18:16:35 INFO api.DMLScript: END DML run 11/11/2015 18:16:35
 
 If we list the contents of the current directory in our regular file system, we see that no files have been written
@@ -413,7 +411,7 @@ In the `yarn-site.xml` configuration file, we specify the `yarn.nodemanager.aux-
 to be `mapreduce_shuffle`.
 
 	[hadoop@host1 hadoop]$ vi yarn-site.xml
-	
+
 	...
 	<configuration>
 	    <property>
@@ -450,7 +448,7 @@ daemons (ResourceManager and NodeManager) are running.
 	51712 DataNode
 	51880 SecondaryNameNode
 
-We can now view YARN information via the web interface on port 8088 (http://host1.example.com:8088). 
+We can now view YARN information via the web interface on port 8088 (http://host1.example.com:8088).
 
 I'll execute the `genLinearRegressionData.dml` example that we've previously considered.
 
@@ -465,7 +463,7 @@ I'll execute the `genLinearRegressionData.dml` example that we've previously con
 	15/11/12 11:57:07 INFO api.DMLScript: SystemML Statistics:
 	Total execution time:		1.265 sec.
 	Number of executed MR Jobs:	0.
-	
+
 	15/11/12 11:57:07 INFO api.DMLScript: END DML run 11/12/2015 11:57:07
 
 If we examine the HDFS file system, we see the files generated by the execution of the DML script by SystemML on Hadoop.
@@ -529,12 +527,12 @@ First, I created a hadoop user on each slave node.
 	[root@host2 ~]# useradd hadoop
 	[root@host2 ~]# passwd hadoop
 	[root@host2 ~]# exit
-	
+
 	[root@host1 ~]$ ssh root@host3.example.com
 	[root@host3 ~]# useradd hadoop
 	[root@host3 ~]# passwd hadoop
 	[root@host3 ~]# exit
-	
+
 	[root@host1 ~]$ ssh root@host4.example.com
 	[root@host4 ~]# useradd hadoop
 	[root@host4 ~]# passwd hadoop
@@ -547,17 +545,17 @@ tested the passwordless login from the master node to each of the slave nodes fo
 user.
 
 	$ ssh hadoop@host1.example.com
-	
+
 	[hadoop@host1 ~]$ ssh-copy-id host2.example.com
 	[hadoop@host1 ~]$ ssh hadoop@host2.example.com
 	Last login: Thu Nov 12 14:16:21 2015
 	[hadoop@host2 ~]$ exit
-	
+
 	[hadoop@host1 ~]$ ssh-copy-id host3.example.com
 	[hadoop@host1 ~]$ ssh hadoop@host3.example.com
 	Last login: Thu Nov 12 14:16:40 2015
 	[hadoop@host3 ~]$ exit
-	
+
 	[hadoop@host1 ~]$ ssh-copy-id host4.example.com
 	[hadoop@host1 ~]$ ssh hadoop@host4.example.com
 	Last login: Thu Nov 12 14:17:10 2015
@@ -575,7 +573,7 @@ On the master node, I specified the slave nodes in the Hadoop `slaves` configura
 In the `core-site.xml` file, I specified the `fs.defaultFS` property to reference the master node.
 
 	[hadoop@host1 hadoop]$ more core-site.xml
-	
+
 	...
 	<configuration>
 	    <property>
@@ -590,7 +588,7 @@ In the `hdfs-site.xml` configuration file, I removed the previous `dfs.replicati
 will use the default replication value (of 3).
 
 	[hadoop@host1 hadoop]$ more hdfs-site.xml
-	
+
 	...
 	<configuration>
 	</configuration>
@@ -604,7 +602,7 @@ Furthermore, we'll set `mapreduce.map.memory.mb` and `mapreduce.reduce.memory.mb
 values are set to at least 1.5 times the value of the maximum heap size.
 
 	[hadoop@host1 hadoop]$ more mapred-site.xml
-	
+
 	...
 	<configuration>
 	    <property>
@@ -634,7 +632,7 @@ In the `yarn-site.xml` configuration file, I added a `yarn.resourcemanager.hostn
 the master node as the host.
 
 	[hadoop@host1 hadoop]$ more yarn-site.xml
-	
+
 	...
 	<configuration>
 	    <property>
@@ -652,7 +650,7 @@ In the previous example, we specified the `JAVA_HOME` in the `hadoop-env.sh` con
 We will use that same value.
 
 	[hadoop@host1 hadoop]$ more hadoop-env.sh
-	
+
 	...
 	export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
 	...
@@ -744,13 +742,13 @@ If we look at the Hadoop (on port 50070) and YARN (on port 8088) web interfaces,
 Let's go ahead and run the SystemML example from the GitHub README.
 
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
-	
+
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
-	
+
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
-	
+
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
-	
+
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
 	...
 	BEGIN LINEAR REGRESSION SCRIPT
@@ -773,7 +771,7 @@ Let's go ahead and run the SystemML example from the GitHub README.
 	15/11/17 15:50:34 INFO api.DMLScript: SystemML Statistics:
 	Total execution time:		0.480 sec.
 	...
-	
+
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
 	...
 	LOGLHOOD_Z,,FALSE,NaN
@@ -863,7 +861,7 @@ executing SystemML in Hadoop from my home directory rather than from the SystemM
 
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f genRandData4Kmeans.dml -config=systemml-{{site.SYSTEMML_VERSION}}/SystemML-config.xml -nvargs nr=1000000 nf=100 nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx
 
-After the data generation has finished, I'll check HDFS for the amount of space used. The 1M-row matrix `X.mtx` 
+After the data generation has finished, I'll check HDFS for the amount of space used. The 1M-row matrix `X.mtx`
 requires about 2.8GB of space.
 
 	[hadoop@host1 ~]$ hdfs dfs -df -h
@@ -895,7 +893,7 @@ Here we can see the `X.mtx` data files.
 	-rw-r--r--   1 hadoop supergroup  481624723 2015-11-19 11:56 X.mtx/2-r-00004
 	-rw-r--r--   1 hadoop supergroup  481624048 2015-11-19 11:56 X.mtx/2-r-00005
 
-Next, I'll run the `Kmeans.dml` algorithm on the 1M-row matrix `X.mtx`. 
+Next, I'll run the `Kmeans.dml` algorithm on the 1M-row matrix `X.mtx`.
 
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/Kmeans.dml -config=/systemml-{{site.SYSTEMML_VERSION}}/SystemML-config.xml -nvargs X=X.mtx k=5 C=Centroids.mtx
 
@@ -920,7 +918,7 @@ the `Kmeans-predict.dml` script.
 
 	[hadoop@host1 ~]$ hadoop jar systemml-{{site.SYSTEMML_VERSION}}/SystemML.jar -f systemml-{{site.SYSTEMML_VERSION}}/algorithms/Kmeans-predict.dml -config=systemml-{{site.SYSTEMML_VERSION}}/SystemML-config.xml -nvargs X=X.mtx C=Centroids.mtx prY=PredY.mtx O=stats.txt
 
-In the file system, we can see that the `PredY.mtx` matrix was created. 
+In the file system, we can see that the `PredY.mtx` matrix was created.
 The `stats.txt` file lists statistics about the results.
 
 	[hadoop@host1 ~]$ hdfs dfs -ls
@@ -1019,6 +1017,3 @@ in a clustered environment.
     </tr>
   </tbody>
 </table>
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/index.md
----------------------------------------------------------------------
diff --git a/docs/index.md b/docs/index.md
index 56f4ed4..738e525 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -30,59 +30,58 @@ SystemML is a flexible, scalable machine learning system.
 SystemML's distinguishing characteristics are:
 
   1. **Algorithm customizability via R-like and Python-like languages**.
-  2. **Multiple execution modes**, including Standalone, Spark Batch, Spark MLContext, Hadoop Batch, and JMLC.
+  2. **Multiple execution modes**, including Spark MLContext, Spark Batch, Hadoop Batch, Standalone, and JMLC.
   3. **Automatic optimization** based on data and cluster characteristics to ensure both efficiency and scalability.
 
 The [**SystemML GitHub README**](https://github.com/apache/incubator-systemml) describes
-building, testing, and running SystemML. Please read [**Contributing to SystemML**](contributing-to-systemml.html)
+building, testing, and running SystemML. Please read [**Contributing to SystemML**](contributing-to-systemml)
 to find out how to help make SystemML even better!
 
-To download SystemML, visit the [downloads](http://systemml.apache.org/download.html) page.
+To download SystemML, visit the [downloads](http://systemml.apache.org/download) page.
 
 
 ## Running SystemML
 
-* **Standalone** - Standalone mode allows data scientists to rapidly prototype algorithms on a single
+* **[Spark MLContext](spark-mlcontext-programming-guide)** - Spark MLContext is a programmatic API
+for running SystemML from Spark via Scala, Python, or Java.
+  * See the [Spark MLContext Programming Guide](spark-mlcontext-programming-guide) with the
+  following examples:
+    * [**Spark Shell (Scala)**](spark-mlcontext-programming-guide#spark-shell-example---new-api)
+    * [**Zeppelin Notebook (Scala)**](spark-mlcontext-programming-guide#zeppelin-notebook-example---linear-regression-algorithm---old-api)
+    * [**Jupyter Notebook (PySpark)**](spark-mlcontext-programming-guide#jupyter-pyspark-notebook-example---poisson-nonnegative-matrix-factorization---old-api)
+* **[Spark Batch](spark-batch-mode)** - Algorithms are automatically optimized to run across Spark clusters.
+  * See [Invoking SystemML in Spark Batch Mode](spark-batch-mode) for detailed information.
+* **[Hadoop Batch](hadoop-batch-mode)** - Algorithms are automatically optimized when distributed across Hadoop clusters.
+  * See [Invoking SystemML in Hadoop Batch Mode](hadoop-batch-mode) for detailed information.
+* **[Standalone](standalone-guide)** - Standalone mode allows data scientists to rapidly prototype algorithms on a single
 machine in R-like and Python-like declarative languages.
-  * The [SystemML GitHub README](https://github.com/apache/incubator-systemml) describes
-  a linear regression example in Standalone Mode.
-  * The [Quick Start Guide](quick-start-guide.html) provides additional examples of algorithm execution
+  * The [Standalone Guide](standalone-guide) provides examples of algorithm execution
   in Standalone Mode.
-* **Spark MLContext** - Spark MLContext is a programmatic API for running SystemML from Spark via Scala, Python, or Java.
-  * See the [Spark MLContext Programming Guide](spark-mlcontext-programming-guide.html) for
-  [**Spark Shell (Scala)**](spark-mlcontext-programming-guide.html#spark-shell-example),
-  [Java](spark-mlcontext-programming-guide.html#java-example), 
-  [**Zeppelin Notebook**](spark-mlcontext-programming-guide.html#zeppelin-notebook-example---linear-regression-algorithm),
-  and [**Jupyter Notebook (PySpark)**](spark-mlcontext-programming-guide.html#jupyter-pyspark-notebook-example---poisson-nonnegative-matrix-factorization)
-  examples.
-* **Spark Batch** - Algorithms are automatically optimized to run across Spark clusters.
-  * See **Invoking SystemML in Spark Batch Mode** **(Coming soon)**.
-* **Hadoop Batch** - Algorithms are automatically optimized when distributed across Hadoop clusters.
-  * See [Invoking SystemML in Hadoop Batch Mode](hadoop-batch-mode.html) for detailed information.
-* **JMLC** - Java Machine Learning Connector.
-  * See [Java Machine Learning Connector (JMLC)](jmlc.html) for more information.
+* **[JMLC](jmlc)** - Java Machine Learning Connector.
+  * See [Java Machine Learning Connector (JMLC)](jmlc) for more information.
 
 ## Language Guides
 
-* [DML Language Reference](dml-language-reference.html) -
+* [DML Language Reference](dml-language-reference) -
 DML is a high-level R-like declarative language for machine learning.
 * **PyDML Language Reference** **(Coming Soon)** -
 PyDML is a high-level Python-like declarative language for machine learning.
-* [Beginner's Guide to DML and PyDML](beginners-guide-to-dml-and-pydml.html) -
+* [Beginner's Guide to DML and PyDML](beginners-guide-to-dml-and-pydml) -
 An introduction to the basics of DML and PyDML.
 
 ## ML Algorithms
 
-* [Algorithms Reference](algorithms-reference.html) - The Algorithms Reference describes the
+* [Algorithms Reference](algorithms-reference) - The Algorithms Reference describes the
 machine learning algorithms included with SystemML in detail.
 
 ## Tools
 
-* [Debugger Guide](debugger-guide.html) - SystemML supports DML script-level debugging through a
+* [Debugger Guide](debugger-guide) - SystemML supports DML script-level debugging through a
 command-line interface.
-* [IDE Guide](developer-tools-systemml.html) - Useful IDE Guide for Developing SystemML.
+* [IDE Guide](developer-tools-systemml) - A guide to useful IDEs for developing SystemML.
 
 ## Other
 
-* [Contributing to SystemML](contributing-to-systemml.html) - Describes ways to contribute to SystemML.
-* [Troubleshooting Guide](troubleshooting-guide.html) - Troubleshoot various issues related to SystemML.
+* [Contributing to SystemML](contributing-to-systemml) - Describes ways to contribute to SystemML.
+* [Engine Developer Guide](engine-dev-guide) - Guide for internal SystemML engine development.
+* [Troubleshooting Guide](troubleshooting-guide) - Troubleshoot various issues related to SystemML.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/quick-start-guide.md
----------------------------------------------------------------------
diff --git a/docs/quick-start-guide.md b/docs/quick-start-guide.md
deleted file mode 100644
index f05db25..0000000
--- a/docs/quick-start-guide.md
+++ /dev/null
@@ -1,399 +0,0 @@
----
-layout: global
-title: SystemML Quick Start Guide
-description: SystemML Quick Start Guide
-displayTitle: SystemML Quick Start Guide
----
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% endcomment %}
--->
-
-* This will become a table of contents (this text will be scraped).
-{:toc}
-
-<br/>
-
-This tutorial provides a quick introduction to using SystemML by
-running existing SystemML algorithms in standalone mode.
-
-
-# What is SystemML
-
-SystemML enables large-scale machine learning (ML) via a high-level declarative
-language with R-like syntax called [DML](dml-language-reference.html) and
-Python-like syntax called PyDML. DML and PyDML allow data scientists to
-express their ML algorithms with full flexibility but without the need to fine-tune
-distributed runtime execution plans and system configurations.
-These ML programs are dynamically compiled and optimized based on data
-and cluster characteristics using rule-based and cost-based optimization techniques.
-The compiler automatically generates hybrid runtime execution plans ranging
-from in-memory, single node execution to distributed computation for Hadoop
-or Spark Batch execution.
-SystemML features a suite of algorithms for Descriptive Statistics, Classification,
-Clustering, Regression, Matrix Factorization, and Survival Analysis. Detailed descriptions of these
-algorithms can be found in the [Algorithms Reference](algorithms-reference.html).
-
-<br/>
-
-# Download SystemML
-
-Apache incubator releases of SystemML are available from the [downloads](http://systemml.apache.org/download.html) page.
-
-The SystemML project is available on GitHub at [https://github.com/apache/incubator-systemml](https://github.com/apache/incubator-systemml).
-SystemML can be downloaded from GitHub and built with Maven. Instructions to build and
-test SystemML can be found in the [SystemML GitHub README](https://github.com/apache/incubator-systemml).
-
-<br/>
-
-# Standalone vs Distributed Execution Mode
-
-SystemML's standalone mode is designed to allow data scientists to rapidly prototype algorithms
-on a single machine. The standalone release packages all required libraries into a single distribution file.
-In standalone mode, all operations occur on a single node in a non-Hadoop environment. Standalone mode
-is not appropriate for large datasets.
-
-For large-scale production environments, SystemML algorithm execution can be
-distributed across multi-node clusters using [Apache Hadoop](https://hadoop.apache.org/)
-or [Apache Spark](http://spark.apache.org/).
-We will make use of standalone mode throughout this tutorial.
-
-<br/>
-
-# Contents of the SystemML Standalone Package
-
-To follow along with this guide, first build a standalone package of SystemML
-using [Apache Maven](http://maven.apache.org)
-and unpack it.
-
-    $ git clone https://github.com/apache/incubator-systemml.git
-    $ cd incubator-systemml
-    $ mvn clean package -P distribution
-    $ tar -xvzf target/systemml-*-standalone.tar.gz -C ..
-    $ cd ..
-
-The extracted package should have these contents:
-
-    $ ls -lF systemml-{{site.SYSTEMML_VERSION}}/
-    total 96
-    -rw-r--r--  LICENSE
-    -rw-r--r--  NOTICE
-    -rw-r--r--  SystemML-config.xml
-    drwxr-xr-x  docs/
-    drwxr-xr-x  lib/
-    -rw-r--r--  log4j.properties
-    -rw-r--r--  readme.txt
-    -rwxr-xr-x  runStandaloneSystemML.bat*
-    -rwxr-xr-x  runStandaloneSystemML.sh*
-    drwxr-xr-x  scripts/
-
-For the rest of the tutorial we will switch to the `systemml-{{site.SYSTEMML_VERSION}}` directory.
-
-    $ cd  ~/systemml-{{site.SYSTEMML_VERSION}}
-
-Note that standalone mode supports both Mac/UNIX and Windows. To run the following examples on
-Windows, the "`./runStandaloneSystemML.sh ...`" commands can be replaced with
-"`./runStandaloneSystemML.bat ...`" commands.
-
-<br/>
-
-# Choosing Test Data
-
-In this tutorial we will use the [Haberman's Survival Data Set](http://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival)
-which can be downloaded in CSV format from the [Center for Machine Learning and Intelligent Systems](http://cml.ics.uci.edu/)
-
-    $ wget -P data/ http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data
-
-The [Haberman Data Set](http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.names)
-has 306 instances and 4 attributes (including the class attribute):
-
- 1. Age of patient at time of operation (numerical)
- 2. Patient's year of operation (year - 1900, numerical)
- 3. Number of positive axillary nodes detected (numerical)
- 4. Survival status (class attribute)
-   * `1` = the patient survived 5 years or longer
-   * `2` = the patient died within 5 years
-
-
-We will need to create a metadata file (MTD) which stores metadata information
-about the content of the data file. The name of the MTD file associated with the
-data file `<filename>` must be `<filename>.mtd`.
-
-    $ echo '{"rows": 306, "cols": 4, "format": "csv"}' > data/haberman.data.mtd
-
-<br/>
-
-# Example 1 - Univariate Statistics
-
-Let's start with a simple example, computing certain [univariate statistics](algorithms-descriptive-statistics.html#univariate-statistics)
-for each feature column using the algorithm `Univar-Stats.dml` which requires 3
-[arguments](algorithms-descriptive-statistics.html#arguments):
-
-* `X`:  location of the input data file to analyze
-* `TYPES`:  location of the file that contains the feature column types encoded by integer numbers: `1` = scale, `2` = nominal, `3` = ordinal
-* `STATS`:  location where the output matrix of computed statistics is to be stored
-
-We need to create a file `types.csv` that describes the type of each column in
-the data along with its metadata file `types.csv.mtd`.
-
-    $ echo '1,1,1,2' > data/types.csv
-    $ echo '{"rows": 1, "cols": 4, "format": "csv"}' > data/types.csv.mtd
-
-
-To run the `Univar-Stats.dml` algorithm, issue the following command (we set the optional argument `CONSOLE_OUTPUT` to `TRUE` to print the statistics to the console):
-
-    $ ./runStandaloneSystemML.sh scripts/algorithms/Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
-      
-    [...]
-    -------------------------------------------------
-    Feature [1]: Scale
-     (01) Minimum             | 30.0
-     (02) Maximum             | 83.0
-     (03) Range               | 53.0
-     (04) Mean                | 52.45751633986928
-     (05) Variance            | 116.71458266366658
-     (06) Std deviation       | 10.803452349303281
-     (07) Std err of mean     | 0.6175922641866753
-     (08) Coeff of variation  | 0.20594669940735139
-     (09) Skewness            | 0.1450718616532357
-     (10) Kurtosis            | -0.6150152487211726
-     (11) Std err of skewness | 0.13934809593495995
-     (12) Std err of kurtosis | 0.277810485320835
-     (13) Median              | 52.0
-     (14) Interquartile mean  | 52.16013071895425
-    -------------------------------------------------
-    Feature [2]: Scale
-     (01) Minimum             | 58.0
-     (02) Maximum             | 69.0
-     (03) Range               | 11.0
-     (04) Mean                | 62.85294117647059
-     (05) Variance            | 10.558630665380907
-     (06) Std deviation       | 3.2494046632238507
-     (07) Std err of mean     | 0.18575610076612029
-     (08) Coeff of variation  | 0.051698529971741194
-     (09) Skewness            | 0.07798443581479181
-     (10) Kurtosis            | -1.1324380182967442
-     (11) Std err of skewness | 0.13934809593495995
-     (12) Std err of kurtosis | 0.277810485320835
-     (13) Median              | 63.0
-     (14) Interquartile mean  | 62.80392156862745
-    -------------------------------------------------
-    Feature [3]: Scale
-     (01) Minimum             | 0.0
-     (02) Maximum             | 52.0
-     (03) Range               | 52.0
-     (04) Mean                | 4.026143790849673
-     (05) Variance            | 51.691117539912135
-     (06) Std deviation       | 7.189653506248555
-     (07) Std err of mean     | 0.41100513466216837
-     (08) Coeff of variation  | 1.7857418611299172
-     (09) Skewness            | 2.954633471088322
-     (10) Kurtosis            | 11.425776549251449
-     (11) Std err of skewness | 0.13934809593495995
-     (12) Std err of kurtosis | 0.277810485320835
-     (13) Median              | 1.0
-     (14) Interquartile mean  | 1.2483660130718954
-    -------------------------------------------------
-    Feature [4]: Categorical (Nominal)
-     (15) Num of categories   | 2
-     (16) Mode                | 1
-     (17) Num of modes        | 1
-  
-
-The `Univar-Stats.dml` script writes the computed statistics to the `univarOut.mtx` file. The matrix has one row per univariate statistic and one column per input feature. The first column gives the number of the statistic 
-(see above table), the second column gives the number of the feature column in
-the input data, and the third column gives the value of the univariate statistic.
-
-    1 1 30.0
-    1 2 58.0
-    2 1 83.0
-    2 2 69.0
-    2 3 52.0
-    3 1 53.0
-    3 2 11.0
-    3 3 52.0
-    4 1 52.45751633986928
-    4 2 62.85294117647059
-    4 3 4.026143790849673
-    5 1 116.71458266366658
-    5 2 10.558630665380907
-    5 3 51.691117539912135
-    6 1 10.803452349303281
-    6 2 3.2494046632238507
-    6 3 7.189653506248555
-    7 1 0.6175922641866753
-    7 2 0.18575610076612029
-    7 3 0.41100513466216837
-    8 1 0.20594669940735139
-    8 2 0.051698529971741194
-    8 3 1.7857418611299172
-    9 1 0.1450718616532357
-    9 2 0.07798443581479181
-    9 3 2.954633471088322
-    10 1 -0.6150152487211726
-    10 2 -1.1324380182967442
-    10 3 11.425776549251449
-    11 1 0.13934809593495995
-    11 2 0.13934809593495995
-    11 3 0.13934809593495995
-    12 1 0.277810485320835
-    12 2 0.277810485320835
-    12 3 0.277810485320835
-    13 1 52.0
-    13 2 63.0
-    13 3 1.0
-    14 1 52.16013071895425
-    14 2 62.80392156862745
-    14 3 1.2483660130718954
-    15 4 2.0
-    16 4 1.0
-    17 4 1.0
-
-
-<br/>
-<br/>
-
-# Example 2 - Binary-class Support Vector Machines
-
-Let's take the same `haberman.data` to explore the
-[binary-class support vector machines](algorithms-classification.html#binary-class-support-vector-machines) algorithm `l2-svm.dml`.
-This example also illustrates how to use the sampling algorithm `sample.dml`
-and the data split algorithm `splitXY.dml`.
-
-## Sampling the Test Data
-
-First we need to use the `sample.dml` algorithm to separate the input into one
-training data set and one data set for model prediction.
-
-Parameters:
-
- * `X`       : (input)  input data set: filename of input data set
- * `sv`      : (input)  sampling vector: filename of 1-column vector w/ percentages. sum(sv) must be 1.
- * `O`       : (output) folder name w/ samples generated
- * `ofmt`    : (output) format of O: "csv", "binary" (default)
-
-
-We will create the file `perc.csv` and `perc.csv.mtd` to define the sampling vector with a sampling rate of
-50% to generate 2 data sets:
-
-    $ printf "0.5\n0.5" > data/perc.csv
-    $ echo '{"rows": 2, "cols": 1, "format": "csv"}' > data/perc.csv.mtd
-
-Let's run the sampling algorithm to create the two data samples:
-
-    $ ./runStandaloneSystemML.sh scripts/utils/sample.dml -nvargs X=data/haberman.data sv=data/perc.csv O=data/haberman.part ofmt="csv"
-
-
-## Splitting Labels from Features
-
-Next we use the `splitXY.dml` algorithm to separate the feature columns from
-the label column(s).
-
-Parameters:
-
- * `X`       : (input)  filename of data matrix
- * `y`       : (input)  colIndex: starting index is 1
- * `OX`      : (output) filename of output matrix with all columns except y
- * `OY`      : (output) filename of output matrix with y column
- * `ofmt`    : (output) format of OX and OY output matrix: "csv", "binary" (default)
-
-We specify `y=4` as the 4th column contains the labels to be predicted and run
-the `splitXY.dml` algorithm on our training and test data sets.
-
-    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/1 y=4 OX=data/haberman.train.data.csv OY=data/haberman.train.labels.csv ofmt="csv"
-
-    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/2 y=4 OX=data/haberman.test.data.csv  OY=data/haberman.test.labels.csv  ofmt="csv"
-
-## Training and Testing the Model
-
-Now we need to train our model using the `l2-svm.dml` algorithm.
-
-[Parameters](algorithms-classification.html#arguments-1):
-
- * `X`         : (input)  filename of training data features
- * `Y`         : (input)  filename of training data labels
- * `model`     : (output) filename of model that contains the learnt weights
- * `fmt`       : (output) format of model: "csv", "text" (sparse-matrix)
- * `Log`       : (output) log file for metrics and progress while training
- * `confusion` : (output) filename of confusion matrix computed using a held-out test set (optional)
-
-The `l2-svm.dml` algorithm is used on our training data sample to train the model.
-
-    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm.dml -nvargs X=data/haberman.train.data.csv Y=data/haberman.train.labels.csv model=data/l2-svm-model.csv fmt="csv" Log=data/l2-svm-log.csv
-
-The `l2-svm-predict.dml` algorithm is used on our test data sample to predict the labels based on the trained model.
-
-    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm-predict.dml -nvargs X=data/haberman.test.data.csv Y=data/haberman.test.labels.csv model=data/l2-svm-model.csv fmt="csv" confusion=data/l2-svm-confusion.csv
-
-The console output should show the accuracy of the trained model in percent, e.g.:
-
-    15/09/01 01:32:51 INFO api.DMLScript: BEGIN DML run 09/01/2015 01:32:51
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating localtmpdir with value /tmp/systemml
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating scratch with value scratch_space
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating optlevel with value 2
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating numreducers with value 10
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating jvmreuse with value false
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating defaultblocksize with value 1000
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster with value false
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster.mem with value 2048
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.mapreduce.mem with value 2048
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.app.queue with value default
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.matrixmult with value true
-    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.textio with value true
-    Accuracy (%): 74.14965986394557
-    15/09/01 01:32:52 INFO api.DMLScript: SystemML Statistics:
-    Total execution time:		0.130 sec.
-    Number of executed MR Jobs:	0.
-
-The generated file `l2-svm-confusion.csv` should contain a confusion matrix of this form:
-
-    |0   1.0 2.0|
-    |1.0 t1  t2 |
-    |2.0 t3  t4 |
-
- * The model correctly predicted label 1 `t1` times
- * The model incorrectly predicted label 1 as opposed to label 2 `t2` times
- * The model incorrectly predicted label 2 as opposed to label 1 `t3` times
- * The model correctly predicted label 2 `t4` times.
-
-If the confusion matrix looks like this ...
-
-    0,1.0,2.0
-    1.0,107.0,38.0
-    2.0,0.0,2.0
-
-... then the accuracy of the model is (t1+t4)/(t1+t2+t3+t4) = (107+2)/(107+38+0+2) = 0.741496599
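-
-The same computation can be scripted in DML (a sketch, assuming the 3x3 layout above, where the first row and first column hold the labels):
-
-    C = read("data/l2-svm-confusion.csv", format="csv")
-    t1 = as.scalar(C[2,2])
-    t2 = as.scalar(C[2,3])
-    t3 = as.scalar(C[3,2])
-    t4 = as.scalar(C[3,3])
-    print("Accuracy: " + ((t1 + t4) / (t1 + t2 + t3 + t4)))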
-
-<br/>
-
-Refer to the [Algorithms Reference](algorithms-reference.html) for more details.
-
-<br/>
-
-# Troubleshooting
-
-If you encounter a `"java.lang.OutOfMemoryError"` you can edit the invocation
-script (`runStandaloneSystemML.sh` or `runStandaloneSystemML.bat`) to increase
-the memory available to the JVM, i.e:
-
-    java -Xmx16g -Xms4g -Xmn1g -cp ${CLASSPATH} org.apache.sysml.api.DMLScript \
-         -f ${SCRIPT_FILE} -exec singlenode -config=SystemML-config.xml \
-         $@
-
-<br/>
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/release-process.md
----------------------------------------------------------------------
diff --git a/docs/release-process.md b/docs/release-process.md
index ded33e9..ed78a32 100644
--- a/docs/release-process.md
+++ b/docs/release-process.md
@@ -84,23 +84,23 @@ this OS X example.
 
 	# download artifacts
 	wget -r -nH -nd -np -R index.html* https://dist.apache.org/repos/dist/dev/incubator/systemml/0.10.0-incubating-rc1/
-	
+
 	# verify standalone tar.gz works
 	tar -xvzf systemml-0.10.0-incubating-standalone.tar.gz
 	cd systemml-0.10.0-incubating-standalone
 	echo "print('hello world');" > hello.dml
 	./runStandaloneSystemML.sh hello.dml
 	cd ..
-	
+
 	# verify main jar works
 	mkdir lib
 	cp -R systemml-0.10.0-incubating-standalone/lib/* lib/
 	rm lib/systemml-0.10.0-incubating.jar
 	java -cp ./lib/*:systemml-0.10.0-incubating.jar org.apache.sysml.api.DMLScript -s "print('hello world');"
-	
+
 	# verify standalone jar works
 	java -jar systemml-0.10.0-incubating-standalone.jar -s "print('hello world');"
-	
+
 	# verify src works
 	tar -xvzf systemml-0.10.0-incubating-src.tar.gz
 	cd systemml-0.10.0-incubating-src
@@ -111,21 +111,21 @@ this OS X example.
 	java -jar systemml-0.10.0-incubating-standalone.jar -s "print('hello world');"
 	cd ..
 	cd ..
-	
+
 	# verify in-memory jar works
 	echo "import org.apache.sysml.api.jmlc.*;public class JMLCEx {public static void main(String[] args) throws Exception {Connection conn = new Connection();PreparedScript script = conn.prepareScript(\"print('hello world');\", new String[]{}, new String[]{}, false);script.executeScript();}}" > JMLCEx.java
 	javac -cp systemml-0.10.0-incubating-inmemory.jar JMLCEx.java
 	java -cp .:systemml-0.10.0-incubating-inmemory.jar JMLCEx
-	
+
 	# verify distrib tar.gz works
 	tar -xvzf systemml-0.10.0-incubating.tar.gz
 	cd systemml-0.10.0-incubating
 	java -cp ../lib/*:SystemML.jar org.apache.sysml.api.DMLScript -s "print('hello world');"
-	
+
 	# verify spark batch mode
 	export SPARK_HOME=/Users/deroneriksson/spark-1.5.1-bin-hadoop2.6
 	$SPARK_HOME/bin/spark-submit SystemML.jar -s "print('hello world');" -exec hybrid_spark
-	
+
 	# verify hadoop batch mode
 	hadoop jar SystemML.jar -s "print('hello world');"
 
@@ -135,18 +135,18 @@ sanity check on OS X after building the artifacts manually.
 
 	# build distribution artifacts
 	mvn clean package -P distribution
-	
+
 	cd target
-	
+
 	# verify main jar works
 	java -cp ./lib/*:systemml-0.10.0-incubating.jar org.apache.sysml.api.DMLScript -s "print('hello world');"
-	
+
 	# verify SystemML.jar works
 	java -cp ./lib/*:SystemML.jar org.apache.sysml.api.DMLScript -s "print('hello world');"
-	
+
 	# verify standalone jar works
 	java -jar systemml-0.10.0-incubating-standalone.jar -s "print('hello world');"
-	
+
 	# verify src works
 	tar -xvzf systemml-0.10.0-incubating-src.tar.gz
 	cd systemml-0.10.0-incubating-src
@@ -157,28 +157,28 @@ sanity check on OS X after building the artifacts manually.
 	java -jar systemml-0.10.0-incubating-standalone.jar -s "print('hello world');"
 	cd ..
 	cd ..
-	
+
 	# verify in-memory jar works
 	echo "import org.apache.sysml.api.jmlc.*;public class JMLCEx {public static void main(String[] args) throws Exception {Connection conn = new Connection();PreparedScript script = conn.prepareScript(\"print('hello world');\", new String[]{}, new String[]{}, false);script.executeScript();}}" > JMLCEx.java
 	javac -cp systemml-0.10.0-incubating-inmemory.jar JMLCEx.java
 	java -cp .:systemml-0.10.0-incubating-inmemory.jar JMLCEx
-	
+
 	# verify standalone tar.gz works
 	tar -xvzf systemml-0.10.0-incubating-standalone.tar.gz
 	cd systemml-0.10.0-incubating-standalone
 	echo "print('hello world');" > hello.dml
 	./runStandaloneSystemML.sh hello.dml
 	cd ..
-	
+
 	# verify distrib tar.gz works
 	tar -xvzf systemml-0.10.0-incubating.tar.gz
 	cd systemml-0.10.0-incubating
 	java -cp ../lib/*:SystemML.jar org.apache.sysml.api.DMLScript -s "print('hello world');"
-	
+
 	# verify spark batch mode
 	export SPARK_HOME=/Users/deroneriksson/spark-1.5.1-bin-hadoop2.6
 	$SPARK_HOME/bin/spark-submit SystemML.jar -s "print('hello world');" -exec hybrid_spark
-	
+
 	# verify hadoop batch mode
 	hadoop jar SystemML.jar -s "print('hello world');"
 
@@ -222,7 +222,7 @@ The standalone tar.gz and zip artifacts contain `runStandaloneSystemML.sh` and `
 files. Verify that one or more algorithms can be run on a single node using these
 standalone distributions.
 
-Here is an example based on the [Quick Start Guide](http://apache.github.io/incubator-systemml/quick-start-guide.html)
+Here is an example based on the [Standalone Guide](http://apache.github.io/incubator-systemml/standalone-guide.html)
 demonstrating the execution of an algorithm (on OS X).
 
 	$ tar -xvzf systemml-0.10.0-incubating-standalone.tar.gz
@@ -276,7 +276,3 @@ For examples, see the [Spark MLContext Programming Guide](http://apache.github.i
 
 Verify that the performance suite located at scripts/perftest/ executes on Spark and Hadoop. Testing should
 include 80MB, 800MB, 8GB, and 80GB data sizes.
-
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/spark-batch-mode.md
----------------------------------------------------------------------
diff --git a/docs/spark-batch-mode.md b/docs/spark-batch-mode.md
new file mode 100644
index 0000000..c199b1f
--- /dev/null
+++ b/docs/spark-batch-mode.md
@@ -0,0 +1,84 @@
+---
+layout: global
+title: Invoking SystemML in Spark Batch Mode
+description: Invoking SystemML in Spark Batch Mode
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+
+# Overview
+
+Given that a primary purpose of SystemML is to perform machine learning on large distributed data
+sets, one of the most important ways to invoke SystemML is Spark Batch mode. Here, we will look at this
+mode in more depth.
+
+**NOTE:** For a programmatic API to run and interact with SystemML via Scala or Python, please see the
+[Spark MLContext Programming Guide](spark-mlcontext-programming-guide.html).
+
+---
+
+# Spark Batch Mode Invocation Syntax
+
+SystemML can be invoked in Spark Batch mode using the following syntax:
+
+    spark-submit SystemML.jar [-? | -help | -f <filename>] (-config=<config_filename>) ([-args | -nvargs] <args-list>)
+
+The DML script to invoke is specified after the `-f` argument. Configuration settings can be passed to SystemML
+using the optional `-config=` argument. DML scripts can optionally take named arguments (`-nvargs`) or positional
+arguments (`-args`). Named arguments are preferred, and positional arguments are considered deprecated. All the
+primary algorithm scripts included with SystemML use named arguments.
+
+
+**Example #1: DML Invocation with Named Arguments**
+
+    spark-submit systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -nvargs X=X.mtx k=5
+
+
+**Example #2: DML Invocation with Positional Arguments**
+
+    spark-submit systemml/SystemML.jar -f example/test/LinearRegression.dml -args "v" "y" 0.00000001 "w"
+
+# Execution modes
+
+SystemML works seamlessly with all Spark execution modes, including *local* (`--master local[*]`),
+*yarn client* (`--master yarn-client`), *yarn cluster* (`--master yarn-cluster`), *etc*. More
+information on Spark cluster execution modes can be found in the
+[official Spark cluster deployment documentation](https://spark.apache.org/docs/latest/cluster-overview.html).
+Note that Spark can easily be run on a laptop in local mode using the `--master local[*]` option
+described above, which SystemML fully supports, as the sketch below illustrates.
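+
+As an illustrative sketch (the script name `hello.dml` and the driver memory setting are
+placeholders, not SystemML defaults), the same DML script can be run locally or on a YARN
+cluster simply by changing the `--master` option:
+
+    # local mode on a laptop, using all available cores
+    spark-submit --master local[*] SystemML.jar -f hello.dml
+
+    # yarn client mode on a cluster; the driver runs on the submitting machine
+    spark-submit --master yarn-client --driver-memory 4G SystemML.jar -f hello.dml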
+
+# Recommended Spark Configuration Settings
+
+For best performance, we recommend setting the following flags when running SystemML with Spark:
+`--conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128`.
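+
+For example, a complete invocation combining these settings with the K-means example above
+might look like the following sketch (file names and master choice are placeholders):
+
+    spark-submit --master yarn-client \
+      --conf spark.driver.maxResultSize=0 \
+      --conf spark.akka.frameSize=128 \
+      systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -nvargs X=X.mtx k=5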
+
+# Examples
+
+Please see the MNIST examples in the included
+[SystemML-NN](https://github.com/apache/incubator-systemml/tree/master/scripts/staging/SystemML-NN)
+library, which demonstrate Spark Batch mode execution with SystemML by training MNIST classifiers:
+
+  * [MNIST Softmax Classifier](https://github.com/apache/incubator-systemml/blob/master/scripts/staging/SystemML-NN/examples/mnist_softmax-train.dml)
+  * [MNIST LeNet ConvNet](https://github.com/apache/incubator-systemml/blob/master/scripts/staging/SystemML-NN/examples/mnist_lenet-train.dml)