You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2016/09/30 22:45:34 UTC
incubator-systemml git commit: [SYSTEMML-998] Overview page as a
design document for developers
Repository: incubator-systemml
Updated Branches:
refs/heads/master 53ba37ecc -> c7d990ce6
[SYSTEMML-998] Overview page as a design document for developers
Also updated the documentation of Python DSL and mllearn.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c7d990ce
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c7d990ce
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c7d990ce
Branch: refs/heads/master
Commit: c7d990ce6f94b5ceae2ed37e62b66164cb15b8c8
Parents: 53ba37e
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Fri Sep 30 15:42:54 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Fri Sep 30 15:42:54 2016 -0700
----------------------------------------------------------------------
src/main/javadoc/overview.html | 99 +++++++++++++
src/main/python/systemml/defmatrix.py | 108 +++++++++------
src/main/python/systemml/mllearn/estimators.py | 146 +++++++++++++++++++-
3 files changed, 308 insertions(+), 45 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c7d990ce/src/main/javadoc/overview.html
----------------------------------------------------------------------
diff --git a/src/main/javadoc/overview.html b/src/main/javadoc/overview.html
new file mode 100644
index 0000000..dc643b1
--- /dev/null
+++ b/src/main/javadoc/overview.html
@@ -0,0 +1,99 @@
+<!--
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+-->
+
+<!--
+`mvn javadoc:javadoc` will compile the javadocs in the folder target/site/apidocs
+-->
+<html>
+<body>
+<h1>SystemML Architecture</h1>
+Algorithms in Apache SystemML are written in a high-level R-like language called Declarative Machine learning Language (DML)
+or a high-level Python-like language called PyDML.
+SystemML compiles and optimizes these algorithms into hybrid runtime
+plans of multi-threaded, in-memory operations on a single node (scale-up) and distributed MR or Spark operations on
+a cluster of nodes (scale-out). SystemML's high-level architecture consists of the following components:
+
+<h2>Language</h2>
+DML (with either R- or Python-like syntax) provides linear algebra primitives, a rich set of statistical
+functions and matrix manipulations,
+as well as user-defined and external functions, control structures including parfor loops, and recursion.
+The user provides the DML script through one of the following APIs:
+<ul>
+ <li>Command-line interface ( {@see org.apache.sysml.api.DMLScript} )</li>
+ <li>Convenient programmatic interface for Spark users ( {@see org.apache.sysml.api.mlcontext.MLContext} )</li>
+ <li>Java Machine Learning Connector API ( {@see org.apache.sysml.api.jmlc.Connection} )</li>
+</ul>
+
+{@see org.apache.sysml.parser.AParserWrapper} performs syntactic validation and
+parses the input DML script using ANTLR into
+a hierarchy of {@see org.apache.sysml.parser.StatementBlock} and
+{@see org.apache.sysml.parser.Statement} as defined by control structures.
+Another important class of the language component is {@see org.apache.sysml.parser.DMLTranslator}
+which performs live variable analysis and semantic validation.
+During that process we also retrieve input data characteristics -- i.e., format,
+number of rows, columns, and non-zero values -- as well as
+infrastructure characteristics, which are used for subsequent
+optimizations. Finally, we construct directed acyclic graphs (DAGs)
+of high-level operators ( {@see org.apache.sysml.hops.Hop} ) per statement block.
+
+<h2>Optimizer</h2>
+The SystemML optimizer works over programs of HOP DAGs, where HOPs are operators on
+matrices or scalars, and are categorized according to their
+access patterns. Examples are matrix multiplications, unary
+aggregates like rowSums(), binary operations like cell-wise
+matrix additions, reorganization operations like transpose or
+sort, and more specific operations. We perform various optimizations
+on these HOP DAGs, including algebraic simplification rewrites ( {@see org.apache.sysml.hops.rewrite.ProgramRewriter} ),
+intra-/{@see org.apache.sysml.hops.ipa.InterProceduralAnalysis}
+for statistics propagation into functions and over entire programs, and
+operator ordering of matrix multiplication chains. We compute
+memory estimates for all HOPs, reflecting the memory
+requirements of in-memory single-node operations and
+intermediates. Each HOP DAG is compiled to a DAG of
+low-level operators ( {@see org.apache.sysml.lops.Lop} ) such as grouping and aggregate,
+which are backend-specific physical operators. Operator selection
+picks the best physical operators for a given HOP
+based on memory estimates, data, and cluster characteristics.
+Individual LOPs have corresponding runtime implementations,
+called instructions, and the optimizer generates
+an executable runtime program of instructions.
+
+<h2>Runtime</h2>
+We execute the generated runtime program locally
+in CP (control program), i.e., within a driver process.
+This driver handles recompilation, runs in-memory single-node
+{@see org.apache.sysml.runtime.instructions.cp.CPInstruction} (some of which are multi-threaded),
+maintains an in-memory buffer pool, and launches MR or
+Spark jobs if the runtime plan contains distributed computations
+in the form of {@see org.apache.sysml.runtime.instructions.mr.MRInstruction}
+or Spark instructions ( {@see org.apache.sysml.runtime.instructions.spark.SPInstruction} ).
+For the MR backend, the SystemML compiler groups LOPs --
+and thus, MR instructions -- into a minimal number of MR
+jobs (MR-job instructions). This procedure is referred to as piggybacking ( {@see org.apache.sysml.lops.compile.Dag} ).
+For the Spark backend, we rely on Spark's lazy evaluation and stage construction.
+CP instructions may also be backed by GPU kernels ( {@see org.apache.sysml.runtime.instructions.gpu.GPUInstruction} ).
+The multi-level buffer pool caches local matrices in-memory,
+evicts them if necessary, and handles data exchange between
+local and distributed runtime backends.
+The core of SystemML's runtime instructions is an adaptive matrix block library,
+which is sparsity-aware and operates on the entire matrix in CP, or blocks of a matrix in a distributed setting. Further
+key features include parallel for-loops for task-parallel
+computations, and dynamic recompilation for runtime plan adaptation addressing initial unknowns.
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c7d990ce/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 50abc0a..845a2ed 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -309,41 +309,24 @@ def eval(outputs, outputDF=False, execute=True):
###############################################################################
-# DESIGN DECISIONS:
-# 1. Until eval() method is invoked, we create an AST (not exposed to the user) that consist of unevaluated operations and data required by those operations.
-# As an anology, a spark user can treat eval() method similar to calling RDD.persist() followed by RDD.count().
-# 2. The AST consist of two kinds of nodes: either of type matrix or of type DMLOp.
-# Both these classes expose _visit method, that helps in traversing the AST in DFS manner.
-# 3. A matrix object can either be evaluated or not.
-# If evaluated, the attribute 'data' is set to one of the supported types (for example: NumPy array or DataFrame). In this case, the attribute 'op' is set to None.
-# If not evaluated, the attribute 'op' which refers to one of the intermediate node of AST and if of type DMLOp. In this case, the attribute 'data' is set to None.
-# 5. DMLOp has an attribute 'inputs' which contains list of matrix objects or DMLOp.
-# 6. To simplify the traversal, every matrix object is considered immutable and an matrix operations creates a new matrix object.
-# As an example:
-# - m1 = sml.matrix(np.ones((3,3))) creates a matrix object backed by 'data=(np.ones((3,3))'.
-# - m1 = m1 * 2 will create a new matrix object which is now backed by 'op=DMLOp( ... )' whose input is earlier created matrix object.
-# 7. Left indexing (implemented in __setitem__ method) is a special case, where Python expects the existing object to be mutated.
-# To ensure the above property, we make deep copy of existing object and point any references to the left-indexed matrix to the newly created object.
-# Then the left-indexed matrix is set to be backed by DMLOp consisting of following pydml:
-# left-indexed-matrix = new-deep-copied-matrix
-# left-indexed-matrix[index] = value
-# 8. Please use m.printAST() and/or type `m` for debugging. Here is a sample session:
-# >>> npm = np.ones((3,3))
-# >>> m1 = sml.matrix(npm + 3)
-# >>> m2 = sml.matrix(npm + 5)
-# >>> m3 = m1 + m2
-# >>> m3
-# mVar2 = load(" ", format="csv")
-# mVar1 = load(" ", format="csv")
-# mVar3 = mVar1 + mVar2
-# save(mVar3, " ")
-# >>> m3.printAST()
-# - [mVar3] (op).
-# - [mVar1] (data).
-# - [mVar2] (data).
class matrix(object):
"""
- matrix class is a python wrapper that implements basic matrix operator.
+ matrix class is a python wrapper that implements basic matrix operators, matrix functions
+ as well as converters to common Python types (for example: Numpy arrays, PySpark DataFrame
+ and Pandas DataFrame).
+
+ The operators supported are:
+
+ 1. Arithmetic operators: +, -, *, /, //, %, ** as well as dot (i.e. matrix multiplication)
+ 2. Indexing in the matrix
+ 3. Relational/Boolean operators: <, <=, >, >=, ==, !=, &, |
+
+ In addition, following functions are supported for matrix:
+
+ 1. transpose
+ 2. Aggregation functions: sum, mean, max, min, argmin, argmax, cumsum
+ 3. Global statistical built-in functions: exp, log, abs, sqrt, round, floor, ceil, sin, cos, tan, asin, acos, atan, sign, solve
+
Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
Examples
@@ -366,23 +349,54 @@ class matrix(object):
mVar4 = mVar1 * mVar3
mVar5 = 1.0 - mVar4
save(mVar5, " ")
-
- <SystemML.defmatrix.matrix object>
>>> m2.eval()
>>> m2
# This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
- <SystemML.defmatrix.matrix object>
>>> m4
# This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
mVar4 = load(" ", format="csv")
mVar5 = 1.0 - mVar4
save(mVar5, " ")
-
- <SystemML.defmatrix.matrix object>
>>> m4.sum(axis=1).toNumPy()
array([[-60.],
[-60.],
[-60.]])
+
+ Design Decisions:
+
+ 1. Until eval() method is invoked, we create an AST (not exposed to the user) that consists of unevaluated operations and data required by those operations.
+ As an analogy, a Spark user can treat eval() method similar to calling RDD.persist() followed by RDD.count().
+ 2. The AST consists of two kinds of nodes: either of type matrix or of type DMLOp.
+ Both these classes expose _visit method, that helps in traversing the AST in DFS manner.
+ 3. A matrix object can either be evaluated or not.
+ If evaluated, the attribute 'data' is set to one of the supported types (for example: NumPy array or DataFrame). In this case, the attribute 'op' is set to None.
+ If not evaluated, the attribute 'op' refers to one of the intermediate nodes of the AST and is of type DMLOp. In this case, the attribute 'data' is set to None.
+ 5. DMLOp has an attribute 'inputs' which contains list of matrix objects or DMLOp.
+ 6. To simplify the traversal, every matrix object is considered immutable and a matrix operation creates a new matrix object.
+ As an example:
+ `m1 = sml.matrix(np.ones((3,3)))` creates a matrix object backed by 'data=np.ones((3,3))'.
+ `m1 = m1 * 2` will create a new matrix object which is now backed by 'op=DMLOp( ... )' whose input is earlier created matrix object.
+ 7. Left indexing (implemented in __setitem__ method) is a special case, where Python expects the existing object to be mutated.
+ To ensure the above property, we make deep copy of existing object and point any references to the left-indexed matrix to the newly created object.
+ Then the left-indexed matrix is set to be backed by DMLOp consisting of following pydml:
+ left-indexed-matrix = new-deep-copied-matrix
+ left-indexed-matrix[index] = value
+ 8. Please use m.printAST() and/or type `m` for debugging. Here is a sample session:
+
+ >>> npm = np.ones((3,3))
+ >>> m1 = sml.matrix(npm + 3)
+ >>> m2 = sml.matrix(npm + 5)
+ >>> m3 = m1 + m2
+ >>> m3
+ mVar2 = load(" ", format="csv")
+ mVar1 = load(" ", format="csv")
+ mVar3 = mVar1 + mVar2
+ save(mVar3, " ")
+ >>> m3.printAST()
+ - [mVar3] (op).
+ - [mVar1] (data).
+ - [mVar2] (data).
+
"""
# Global variable that is used to keep track of intermediate matrix variables in the DML script
systemmlVarID = 0
@@ -502,7 +516,21 @@ class matrix(object):
def printAST(self, numSpaces = 0):
"""
- Used for debugging purposes
+ Please use m.printAST() and/or type `m` for debugging. Here is a sample session:
+
+ >>> npm = np.ones((3,3))
+ >>> m1 = sml.matrix(npm + 3)
+ >>> m2 = sml.matrix(npm + 5)
+ >>> m3 = m1 + m2
+ >>> m3
+ mVar2 = load(" ", format="csv")
+ mVar1 = load(" ", format="csv")
+ mVar3 = mVar1 + mVar2
+ save(mVar3, " ")
+ >>> m3.printAST()
+ - [mVar3] (op).
+ - [mVar1] (data).
+ - [mVar2] (data).
"""
head = ''.join([ ' ' ]*numSpaces + [ '- [', self.ID, '] ' ])
if self.data is not None:
@@ -529,7 +557,7 @@ class matrix(object):
print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
else:
print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
- return '<SystemML.defmatrix.matrix object>'
+ return ''
######################### Arithmetic operators ######################################
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c7d990ce/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index 82e0b2c..35a0f96 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -180,10 +180,69 @@ class BaseSystemMLRegressor(BaseSystemMLEstimator):
class LogisticRegression(BaseSystemMLClassifier):
+ """
+ Performs both binomial and multinomial logistic regression.
+
+ Examples
+ --------
+
+ Scikit-learn way
+
+ >>> from sklearn import datasets, neighbors
+ >>> from systemml.mllearn import LogisticRegression
+ >>> from pyspark.sql import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> digits = datasets.load_digits()
+ >>> X_digits = digits.data
+ >>> y_digits = digits.target + 1
+ >>> n_samples = len(X_digits)
+ >>> X_train = X_digits[:int(.9 * n_samples)]
+ >>> y_train = y_digits[:int(.9 * n_samples)]
+ >>> X_test = X_digits[int(.9 * n_samples):]
+ >>> y_test = y_digits[int(.9 * n_samples):]
+ >>> logistic = LogisticRegression(sqlCtx)
+ >>> print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
+
+ MLPipeline way
+
+ >>> from pyspark.ml import Pipeline
+ >>> from systemml.mllearn import LogisticRegression
+ >>> from pyspark.ml.feature import HashingTF, Tokenizer
+ >>> from pyspark.sql import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> training = sqlCtx.createDataFrame([
+ >>> (0L, "a b c d e spark", 1.0),
+ >>> (1L, "b d", 2.0),
+ >>> (2L, "spark f g h", 1.0),
+ >>> (3L, "hadoop mapreduce", 2.0),
+ >>> (4L, "b spark who", 1.0),
+ >>> (5L, "g d a y", 2.0),
+ >>> (6L, "spark fly", 1.0),
+ >>> (7L, "was mapreduce", 2.0),
+ >>> (8L, "e spark program", 1.0),
+ >>> (9L, "a e c l", 2.0),
+ >>> (10L, "spark compile", 1.0),
+ >>> (11L, "hadoop software", 2.0)
+ >>> ], ["id", "text", "label"])
+ >>> tokenizer = Tokenizer(inputCol="text", outputCol="words")
+ >>> hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
+ >>> lr = LogisticRegression(sqlCtx)
+ >>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
+ >>> model = pipeline.fit(training)
+ >>> test = sqlCtx.createDataFrame([
+ >>> (12L, "spark i j k"),
+ >>> (13L, "l m n"),
+ >>> (14L, "mapreduce spark"),
+ >>> (15L, "apache hadoop")], ["id", "text"])
+ >>> prediction = model.transform(test)
+ >>> prediction.show()
+
+ """
+
def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
"""
Performs both binomial and multinomial logistic regression.
-
+
Parameters
----------
sqlCtx: PySpark SQLContext
@@ -216,10 +275,39 @@ class LogisticRegression(BaseSystemMLClassifier):
class LinearRegression(BaseSystemMLRegressor):
-
+ """
+ Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
+
+ Examples
+ --------
+
+ >>> import numpy as np
+ >>> from sklearn import datasets
+ >>> from systemml.mllearn import LinearRegression
+ >>> from pyspark.sql import SQLContext
+ >>> # Load the diabetes dataset
+ >>> diabetes = datasets.load_diabetes()
+ >>> # Use only one feature
+ >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
+ >>> # Split the data into training/testing sets
+ >>> diabetes_X_train = diabetes_X[:-20]
+ >>> diabetes_X_test = diabetes_X[-20:]
+ >>> # Split the targets into training/testing sets
+ >>> diabetes_y_train = diabetes.target[:-20]
+ >>> diabetes_y_test = diabetes.target[-20:]
+ >>> # Create linear regression object
+ >>> regr = LinearRegression(sqlCtx, solver='newton-cg')
+ >>> # Train the model using the training sets
+ >>> regr.fit(diabetes_X_train, diabetes_y_train)
+ >>> # The mean square error
+ >>> print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
+
+ """
+
+
def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
"""
- Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables..
+ Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
Parameters
----------
@@ -252,6 +340,29 @@ class LinearRegression(BaseSystemMLRegressor):
class SVM(BaseSystemMLClassifier):
+ """
+ Performs both binary-class and multiclass SVM (Support Vector Machines).
+
+ Examples
+ --------
+
+ >>> from sklearn import datasets, neighbors
+ >>> from systemml.mllearn import SVM
+ >>> from pyspark.sql import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> digits = datasets.load_digits()
+ >>> X_digits = digits.data
+ >>> y_digits = digits.target
+ >>> n_samples = len(X_digits)
+ >>> X_train = X_digits[:int(.9 * n_samples)]
+ >>> y_train = y_digits[:int(.9 * n_samples)]
+ >>> X_test = X_digits[int(.9 * n_samples):]
+ >>> y_test = y_digits[int(.9 * n_samples):]
+ >>> svm = SVM(sqlCtx, is_multi_class=True)
+ >>> print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))
+
+ """
+
def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
"""
@@ -282,10 +393,35 @@ class SVM(BaseSystemMLClassifier):
class NaiveBayes(BaseSystemMLClassifier):
-
+ """
+ Performs Naive Bayes.
+
+ Examples
+ --------
+
+ >>> from sklearn.datasets import fetch_20newsgroups
+ >>> from sklearn.feature_extraction.text import TfidfVectorizer
+ >>> from systemml.mllearn import NaiveBayes
+ >>> from sklearn import metrics
+ >>> from pyspark.sql import SQLContext
+ >>> sqlCtx = SQLContext(sc)
+ >>> categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
+ >>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
+ >>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
+ >>> vectorizer = TfidfVectorizer()
+ >>> # Both vectors and vectors_test are SciPy CSR matrix
+ >>> vectors = vectorizer.fit_transform(newsgroups_train.data)
+ >>> vectors_test = vectorizer.transform(newsgroups_test.data)
+ >>> nb = NaiveBayes(sqlCtx)
+ >>> nb.fit(vectors, newsgroups_train.target)
+ >>> pred = nb.predict(vectors_test)
+ >>> metrics.f1_score(newsgroups_test.target, pred, average='weighted')
+
+ """
+
def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
"""
- Performs both binary-class and multiclass SVM (Support Vector Machines).
+ Performs Naive Bayes.
Parameters
----------