Posted to commits@systemml.apache.org by ni...@apache.org on 2016/09/02 00:10:52 UTC

[1/2] incubator-systemml git commit: [SYSTEMML-878] Update the Python package from SystemML to systemml

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 7610a21db -> 542de374e


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
new file mode 100644
index 0000000..1b90e70
--- /dev/null
+++ b/src/main/python/systemml/mlcontext.py
@@ -0,0 +1,302 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+import os
+
+try:
+    import py4j.java_gateway
+    from py4j.java_gateway import JavaObject
+except ImportError:
+    raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
+
+from pyspark import SparkContext
+import pyspark.mllib.common
+from pyspark.sql import DataFrame, SQLContext
+from .converters import *
+
+def dml(scriptString):
+    """
+    Create a dml script object based on a string.
+
+    Parameters
+    ----------
+    scriptString: string
+        Can be a path to a dml script or a dml script itself.
+
+    Returns
+    -------
+    script: Script instance
+        Instance of a script object.
+    """
+    if not isinstance(scriptString, str):
+        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+    return Script(scriptString, scriptType="dml")
+
+
+def pydml(scriptString):
+    """
+    Create a pydml script object based on a string.
+
+    Parameters
+    ----------
+    scriptString: string
+        Can be a path to a pydml script or a pydml script itself.
+
+    Returns
+    -------
+    script: Script instance
+        Instance of a script object.
+    """
+    if not isinstance(scriptString, str):
+        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+    return Script(scriptString, scriptType="pydml")
+
+
+def _java2py(sc, obj):
+    """ Convert Java object to Python. """
+    # TODO: Port this private PySpark function.
+    obj = pyspark.mllib.common._java2py(sc, obj)
+    if isinstance(obj, JavaObject):
+        class_name = obj.getClass().getSimpleName()
+        if class_name == 'Matrix':
+            obj = Matrix(obj, sc)
+    return obj
+
+
+def _py2java(sc, obj):
+    """ Convert Python object to Java. """
+    if isinstance(obj, Matrix):
+        obj = obj._java_matrix
+    # TODO: Port this private PySpark function.
+    obj = pyspark.mllib.common._py2java(sc, obj)
+    return obj
+
+
+class Matrix(object):
+    """
+    Wrapper around a Java Matrix object.
+
+    Parameters
+    ----------
+    javaMatrix: JavaObject
+        A Java Matrix object as returned by calling `ml.execute().get()`.
+
+    sc: SparkContext
+        SparkContext
+    """
+    def __init__(self, javaMatrix, sc):
+        self._java_matrix = javaMatrix
+        self.sc = sc
+
+    def __repr__(self):
+        return "Matrix"
+
+    def toDF(self):
+        """
+        Convert the Matrix to a PySpark SQL DataFrame.
+
+        Returns
+        -------
+        df: PySpark SQL DataFrame
+            A PySpark SQL DataFrame representing the matrix, with
+            one "ID" column containing the row index (since Spark
+            DataFrames are unordered), followed by columns of doubles
+            for each column in the matrix.
+        """
+        jdf = self._java_matrix.asDataFrame()
+        df = _java2py(self.sc, jdf)
+        return df
+
+
+class MLResults(object):
+    """
+    Wrapper around a Java ML Results object.
+
+    Parameters
+    ----------
+    results: JavaObject
+        A Java MLResults object as returned by calling `ml.execute()`.
+
+    sc: SparkContext
+        SparkContext
+    """
+    def __init__(self, results, sc):
+        self._java_results = results
+        self.sc = sc
+        try:
+            if MLResults.sqlContext is None:
+                MLResults.sqlContext = SQLContext(sc)
+        except AttributeError:
+            MLResults.sqlContext = SQLContext(sc)
+
+    def __repr__(self):
+        return "MLResults"
+
+    def getNumPyArray(self, *outputs):
+        """
+        Parameters
+        ----------
+        outputs: string, list of strings
+            Output variables as defined inside the DML script.
+        """
+        outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).asBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
+        if len(outs) == 1:
+            return outs[0]
+        return outs
+
+    def getDataFrame(self, *outputs):
+        """
+        Parameters
+        ----------
+        outputs: string, list of strings
+            Output variables as defined inside the DML script.
+        """
+        outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
+        if len(outs) == 1:
+            return outs[0]
+        return outs
+
+    def get(self, *outputs):
+        """
+        Parameters
+        ----------
+        outputs: string, list of strings
+            Output variables as defined inside the DML script.
+        """
+        outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
+        if len(outs) == 1:
+            return outs[0]
+        return outs
+
+
+class Script(object):
+    """
+    Instance of a DML/PyDML Script.
+
+    Parameters
+    ----------
+    scriptString: string
+        Can be either a file path to a DML script or a DML script itself.
+
+    scriptType: string
+        Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like).
+    """
+    def __init__(self, scriptString, scriptType="dml"):
+        self.scriptString = scriptString
+        self.scriptType = scriptType
+        self._input = {}
+        self._output = []
+
+    def input(self, *args, **kwargs):
+        """
+        Parameters
+        ----------
+        args: name, value tuple
+            where name is a string, and the currently supported value formats
+            are double, string, dataframe, rdd, and lists of such objects.
+
+        kwargs: dict of name, value pairs
+            See above for the formats supported for names and values.
+        """
+        if args and len(args) != 2:
+            raise ValueError("Expected name, value pair.")
+        elif args:
+            self._input[args[0]] = args[1]
+        for name, value in kwargs.items():
+            self._input[name] = value
+        return self
+
+    def output(self, *names):
+        """
+        Parameters
+        ----------
+        names: string, list of strings
+            Output variables as defined inside the DML script.
+        """
+        self._output.extend(names)
+        return self
+
+
+class MLContext(object):
+    """
+    Wrapper around the new SystemML MLContext.
+
+    Parameters
+    ----------
+    sc: SparkContext
+        SparkContext
+    """
+    def __init__(self, sc):
+        if not isinstance(sc, SparkContext):
+            raise ValueError("Expected sc to be a SparkContext, got " % sc)
+        self._sc = sc
+        self._ml = sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc)
+
+    def __repr__(self):
+        return "MLContext"
+
+    def execute(self, script):
+        """
+        Execute a DML / PyDML script.
+
+        Parameters
+        ----------
+        script: Script instance
+            Script instance defined with the appropriate input and output variables.
+
+        Returns
+        -------
+        ml_results: MLResults
+            MLResults instance.
+        """
+        if not isinstance(script, Script):
+            raise ValueError("Expected script to be an instance of Script")
+        scriptString = script.scriptString
+        if script.scriptType == "dml":
+            if scriptString.endswith(".dml"):
+                if os.path.exists(scriptString):
+                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString)
+                else:
+                    raise ValueError("path: %s does not exist" % scriptString)
+            else:
+                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString)
+        elif script.scriptType == "pydml":
+            if scriptString.endswith(".pydml"):
+                if os.path.exists(scriptString):
+                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString)
+                else:
+                    raise ValueError("path: %s does not exist" % scriptString)
+            else:
+                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString)
+        else:
+            raise ValueError("scriptType should be 'dml' or 'pydml', got %s" % script.scriptType)
+
+        for key, val in script._input.items():
+            # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not
+            # allowed. Therefore, we use the following code in which we retrieve a function
+            # representing `script_java.in`, and then call it with the arguments.  This is in
+            # lieu of adding a new `input` method on the JVM side, as that would complicate use
+            # from Scala/Java.
+            py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val))
+        for val in script._output:
+            script_java.out(val)
+        return MLResults(self._ml.execute(script_java), self._sc)
+
+
+__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml']
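For orientation, here is a minimal usage sketch of the API defined above (not part of the commit). It assumes a pyspark session with the SystemML jar on the driver class path, so that `sc` is an existing SparkContext; the variable names are illustrative:

    from systemml import MLContext, dml

    ml = MLContext(sc)

    # Bind scalar inputs by keyword and declare the output variable of the script.
    script = dml("z = x + y").input(x=4.0, y=5.0).output("z")

    # Execute on the JVM-side MLContext and pull the result back into Python.
    z = ml.execute(script).get("z")
    print(z)   # expected: 9.0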

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/__init__.py b/src/main/python/systemml/mllearn/__init__.py
new file mode 100644
index 0000000..69cab58
--- /dev/null
+++ b/src/main/python/systemml/mllearn/__init__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from .estimators import *
+
+__all__ = estimators.__all__
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
new file mode 100644
index 0000000..5d33d64
--- /dev/null
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -0,0 +1,302 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from pyspark.context import SparkContext 
+from pyspark.sql import DataFrame, SQLContext
+from pyspark.rdd import RDD
+import numpy as np
+import pandas as pd
+import sklearn as sk
+import sklearn.metrics
+from pyspark.ml.feature import VectorAssembler
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml import Estimator, Model
+
+from ..converters import *
+
+def assemble(sqlCtx, pdf, inputCols, outputCol):
+    tmpDF = sqlCtx.createDataFrame(pdf, list(pdf.columns))
+    assembler = VectorAssembler(inputCols=list(inputCols), outputCol=outputCol)
+    return assembler.transform(tmpDF)
+
+class BaseSystemMLEstimator(Estimator):
+    featuresCol = 'features'
+    labelCol = 'label'
+    
+    def setFeaturesCol(self, colName):
+        """
+        Sets the default column name for features of PySpark DataFrame.
+        
+        Parameters
+        ----------
+        colName: column name for features (default: 'features')
+        """
+        self.featuresCol = colName
+        
+    def setLabelCol(self, colName):
+        """
+        Sets the default column name for the label column of the PySpark DataFrame.
+
+        Parameters
+        ----------
+        colName: column name for the label (default: 'label')
+        """
+        self.labelCol = colName
+        
+    # Returns a model after calling fit(df) on Estimator object on JVM    
+    def _fit(self, X):
+        """
+        Invokes the fit method on the Estimator object on the JVM if X is a PySpark DataFrame
+        
+        Parameters
+        ----------
+        X: PySpark DataFrame that contains the columns featuresCol (default: 'features') and labelCol (default: 'label')
+        """
+        if hasattr(X, '_jdf') and self.featuresCol in X.columns and self.labelCol in X.columns:
+            self.model = self.estimator.fit(X._jdf)
+            return self
+        else:
+            raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
+    
+    def fit(self, X, y=None, params=None):
+        """
+        Invokes the fit method on the Estimator object on the JVM if X and y are among the supported data types
+        
+        Parameters
+        ----------
+        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        """
+        if y is None:
+            return self._fit(X)
+        elif y is not None and isinstance(X, SUPPORTED_TYPES) and isinstance(y, SUPPORTED_TYPES):
+            if self.transferUsingDF:
+                pdfX = convertToPandasDF(X)
+                pdfY = convertToPandasDF(y)
+                if getNumCols(pdfY) != 1:
+                    raise Exception('y should be a column vector')
+                if pdfX.shape[0] != pdfY.shape[0]:
+                    raise Exception('Number of rows of X and y should match')
+                colNames = pdfX.columns
+                pdfX[self.labelCol] = pdfY[pdfY.columns[0]]
+                df = assemble(self.sqlCtx, pdfX, colNames, self.featuresCol).select(self.featuresCol, self.labelCol)
+                self.model = self.estimator.fit(df._jdf)
+            else:
+                numColsy = getNumCols(y)
+                if numColsy != 1:
+                    raise Exception('Expected y to be a column vector')
+                self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y))
+            if self.setOutputRawPredictionsToFalse:
+                self.model.setOutputRawPredictions(False)
+            return self
+        else:
+            raise Exception('Unsupported input type')
+    
+    def transform(self, X):
+        return self.predict(X)
+    
+    # Returns predictions (as a NumPy array, Pandas DataFrame, or PySpark DataFrame) after calling transform() on the Model object on the JVM
+    def predict(self, X):
+        """
+        Invokes the transform method on the Model object on the JVM if X is one of the supported data types
+        
+        Parameters
+        ----------
+        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
+        """
+        if isinstance(X, SUPPORTED_TYPES):
+            if self.transferUsingDF:
+                pdfX = convertToPandasDF(X)
+                df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
+                retjDF = self.model.transform(df._jdf)
+                retDF = DataFrame(retjDF, self.sqlCtx)
+                retPDF = retDF.sort('ID').select('prediction').toPandas()
+                if isinstance(X, np.ndarray):
+                    return retPDF.as_matrix().flatten()
+                else:
+                    return retPDF
+            else:
+                retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
+                if isinstance(X, np.ndarray):
+                    return retNumPy
+                else:
+                    return retNumPy # TODO: Convert to Pandas
+        elif hasattr(X, '_jdf'):
+            if self.featuresCol in X.columns:
+                # No need to assemble as input DF is likely coming via MLPipeline
+                df = X
+            else:
+                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
+                df = assembler.transform(X)
+            retjDF = self.model.transform(df._jdf)
+            retDF = DataFrame(retjDF, self.sqlCtx)
+            # Return DF
+            return retDF.sort('ID')
+        else:
+            raise Exception('Unsupported input type')
+            
+class BaseSystemMLClassifier(BaseSystemMLEstimator):
+
+    def score(self, X, y):
+        """
+        Scores the predicted values against the ground truth 'y'
+        
+        Parameters
+        ----------
+        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        """
+        return sk.metrics.accuracy_score(y, self.predict(X))    
+
+class BaseSystemMLRegressor(BaseSystemMLEstimator):
+
+    def score(self, X, y):
+        """
+        Scores the predicted values against the ground truth 'y'
+        
+        Parameters
+        ----------
+        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+        """
+        return sk.metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')
+
+
+class LogisticRegression(BaseSystemMLClassifier):
+    def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+        """
+        Performs both binomial and multinomial logistic regression.
+        
+        Parameters
+        ----------
+        sqlCtx: PySpark SQLContext
+        penalty: Only 'l2' supported
+        fit_intercept: Specifies whether to add intercept or not (default: True)
+        max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100)
+        max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0)
+        tol: Tolerance used in the convergence criterion (default: 0.000001)
+        C: 1/regularization parameter (default: 1.0)
+        solver: Only 'newton-cg' solver supported
+        """
+        self.sqlCtx = sqlCtx
+        self.sc = sqlCtx._sc
+        self.uid = "logReg"
+        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc())
+        self.estimator.setMaxOuterIter(max_iter)
+        self.estimator.setMaxInnerIter(max_inner_iter)
+        if C <= 0:
+            raise Exception('C has to be positive')
+        reg = 1.0 / C
+        self.estimator.setRegParam(reg)
+        self.estimator.setTol(tol)
+        self.estimator.setIcpt(int(fit_intercept))
+        self.transferUsingDF = transferUsingDF
+        self.setOutputRawPredictionsToFalse = True
+        if penalty != 'l2':
+            raise Exception('Only l2 penalty is supported')
+        if solver != 'newton-cg':
+            raise Exception('Only newton-cg solver supported')
+
+class LinearRegression(BaseSystemMLRegressor):
+
+    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+        """
+        Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
+        
+        Parameters
+        ----------
+        sqlCtx: PySpark SQLContext
+        fit_intercept: Specifies whether to add intercept or not (default: True)
+        max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
+        tol: Tolerance used in the convergence criterion (default: 0.000001)
+        C: 1/regularization parameter (default: 1.0)
+        solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
+            Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
+            'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
+            input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
+        """
+        self.sqlCtx = sqlCtx
+        self.sc = sqlCtx._sc
+        self.uid = "lr"
+        if solver == 'newton-cg' or solver == 'direct-solve':
+            self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver)
+        else:
+            raise Exception("Only 'newton-cg' and 'direct-solve' solvers are supported")
+        self.estimator.setMaxIter(max_iter)
+        if C <= 0:
+            raise Exception('C has to be positive')
+        reg = 1.0 / C
+        self.estimator.setRegParam(reg)
+        self.estimator.setTol(tol)
+        self.estimator.setIcpt(int(fit_intercept))
+        self.transferUsingDF = transferUsingDF
+        self.setOutputRawPredictionsToFalse = False
+
+
+class SVM(BaseSystemMLClassifier):
+
+    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
+        """
+        Performs both binary-class and multiclass SVM (Support Vector Machines).
+        
+        Parameters
+        ----------
+        sqlCtx: PySpark SQLContext
+        fit_intercept: Specifies whether to add intercept or not (default: True)
+        max_iter: Maximum number of iterations (default: 100)
+        tol: Tolerance used in the convergence criterion (default: 0.000001)
+        C: 1/regularization parameter (default: 1.0)
+        is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False)
+        """
+        self.sqlCtx = sqlCtx
+        self.sc = sqlCtx._sc
+        self.uid = "svm"
+        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class)
+        self.estimator.setMaxIter(max_iter)
+        if C <= 0:
+            raise Exception('C has to be positive')
+        reg = 1.0 / C
+        self.estimator.setRegParam(reg)
+        self.estimator.setTol(tol)
+        self.estimator.setIcpt(int(fit_intercept))
+        self.transferUsingDF = transferUsingDF
+        self.setOutputRawPredictionsToFalse = False    
+
+class NaiveBayes(BaseSystemMLClassifier):
+
+    def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
+        """
+        Performs Naive Bayes classification.
+        
+        Parameters
+        ----------
+        sqlCtx: PySpark SQLContext
+        laplace: Laplace smoothing specified by the user to avoid creation of 0 probabilities (default: 1.0)
+        """
+        self.sqlCtx = sqlCtx
+        self.sc = sqlCtx._sc
+        self.uid = "nb"
+        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc())
+        self.estimator.setLaplace(laplace)
+        self.transferUsingDF = transferUsingDF
+        self.setOutputRawPredictionsToFalse = False
+
+__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']
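As a rough illustration of the scikit-learn style interface defined above (not part of the commit; the data and names are made up), assuming a pyspark session where `sc` is the SparkContext:

    import numpy as np
    from pyspark.sql import SQLContext
    from systemml.mllearn import LinearRegression

    sqlCtx = SQLContext(sc)

    # Tiny synthetic problem: y is roughly 2*x + 1.
    X = np.arange(0, 10, 0.5).reshape(-1, 1)
    y = 2 * X[:, 0] + 1 + 0.1 * np.random.randn(X.shape[0])

    # 'direct-solve' suits a small number of features (see the LinearRegression docstring).
    lr = LinearRegression(sqlCtx, fit_intercept=True, solver='direct-solve')
    lr.fit(X, y)

    print(lr.predict(X[:3]))          # predictions as a NumPy array
    print('R2: %f' % lr.score(X, y))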

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/tests/test_mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mlcontext.py b/src/main/python/tests/test_mlcontext.py
index 182a4d8..6a6f64e 100644
--- a/src/main/python/tests/test_mlcontext.py
+++ b/src/main/python/tests/test_mlcontext.py
@@ -23,7 +23,7 @@ import unittest
 
 from pyspark.context import SparkContext
 
-from SystemML import MLContext, dml, pydml
+from systemml import MLContext, dml, pydml
 
 sc = SparkContext()
 ml = MLContext(sc)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/tests/test_mllearn.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mllearn.py b/src/main/python/tests/test_mllearn.py
index 22f798f..27b9813 100644
--- a/src/main/python/tests/test_mllearn.py
+++ b/src/main/python/tests/test_mllearn.py
@@ -20,7 +20,7 @@
 #
 #-------------------------------------------------------------
 from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression, LinearRegression, SVM, NaiveBayes 
+from systemml.mllearn import LogisticRegression, LinearRegression, SVM, NaiveBayes 
 from pyspark.sql import SQLContext
 from pyspark.context import SparkContext
 import unittest

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/uploadToPyPI.sh
----------------------------------------------------------------------
diff --git a/src/main/python/uploadToPyPI.sh b/src/main/python/uploadToPyPI.sh
deleted file mode 100644
index c892f3d..0000000
--- a/src/main/python/uploadToPyPI.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-cd ../../..
-mvn clean package -P distribution
-tar -xzf target/systemml-*-SNAPSHOT.tar.gz -C src/main/python/SystemML
-
-cd src/main/python/SystemML
-mv systemml-*-incubating-SNAPSHOT SystemML-java
-
-cd ..
-echo "Preparing to upload to PyPI ...."
-python setup.py register sdist upload
-
-rm -r SystemML/SystemML-java
\ No newline at end of file


[2/2] incubator-systemml git commit: [SYSTEMML-878] Update the Python package from SystemML to systemml

Posted by ni...@apache.org.
[SYSTEMML-878] Update the Python package from SystemML to systemml

- Updated Python package name from SystemML to systemml
- Moved uploadToPyPI.sh script to dev/release
- Updated the documentation

Closes #231.
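In user code the rename only changes the import paths, e.g. (illustrative, mirroring the test updates below):

    # before this commit
    from SystemML import MLContext, dml, pydml
    from SystemML.mllearn import LogisticRegression

    # after this commit
    from systemml import MLContext, dml, pydml
    from systemml.mllearn import LogisticRegression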


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/542de374
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/542de374
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/542de374

Branch: refs/heads/master
Commit: 542de374e19e25e3581de45af06d37a2f4cdaad0
Parents: 7610a21
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 1 16:55:57 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 1 16:55:57 2016 -0700

----------------------------------------------------------------------
 dev/release/uploadToPyPI.sh                    |  34 ++
 docs/algorithms-classification.md              |  18 +-
 docs/algorithms-regression.md                  |   8 +-
 docs/beginners-guide-python.md                 |  49 ++-
 src/main/python/MANIFEST.in                    |  18 +-
 src/main/python/SystemML/__init__.py           |  29 --
 src/main/python/SystemML/converters.py         | 100 -----
 src/main/python/SystemML/defmatrix.py          | 410 --------------------
 src/main/python/SystemML/mlcontext.py          | 302 --------------
 src/main/python/SystemML/mllearn/__init__.py   |  25 --
 src/main/python/SystemML/mllearn/estimators.py | 302 --------------
 src/main/python/setup.py                       |   4 +-
 src/main/python/systemml/__init__.py           |  29 ++
 src/main/python/systemml/converters.py         | 100 +++++
 src/main/python/systemml/defmatrix.py          | 410 ++++++++++++++++++++
 src/main/python/systemml/mlcontext.py          | 302 ++++++++++++++
 src/main/python/systemml/mllearn/__init__.py   |  25 ++
 src/main/python/systemml/mllearn/estimators.py | 302 ++++++++++++++
 src/main/python/tests/test_mlcontext.py        |   2 +-
 src/main/python/tests/test_mllearn.py          |   2 +-
 src/main/python/uploadToPyPI.sh                |  34 --
 21 files changed, 1265 insertions(+), 1240 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/dev/release/uploadToPyPI.sh
----------------------------------------------------------------------
diff --git a/dev/release/uploadToPyPI.sh b/dev/release/uploadToPyPI.sh
new file mode 100644
index 0000000..44a6b4f
--- /dev/null
+++ b/dev/release/uploadToPyPI.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+cd ../..
+mvn clean package -P distribution
+tar -xzf target/systemml-*-SNAPSHOT.tar.gz -C src/main/python/systemml
+ 
+cd src/main/python/systemml
+mv systemml-*-incubating-SNAPSHOT systemml-java
+
+cd ..
+echo "Preparing to upload to PyPI ...."
+python setup.py register sdist upload -r https://pypi.python.org/pypi
+
+rm -r systemml/systemml-java
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-classification.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md
index 340267c..8d19d04 100644
--- a/docs/algorithms-classification.md
+++ b/docs/algorithms-classification.md
@@ -129,7 +129,7 @@ Eqs. (1) and (2).
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 # C = 1/reg
 logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -237,7 +237,7 @@ SystemML Language Reference for details.
 {% highlight python %}
 # Scikit-learn way
 from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
 digits = datasets.load_digits()
@@ -253,7 +253,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te
 
 # MLPipeline way
 from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
@@ -498,7 +498,7 @@ support vector machine (`y` with domain size `2`).
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
 # C = 1/reg
 svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -766,7 +766,7 @@ class labels.
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
 # C = 1/reg
 svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -916,7 +916,7 @@ SystemML Language Reference for details.
 {% highlight python %}
 # Scikit-learn way
 from sklearn import datasets, neighbors
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
 digits = datasets.load_digits()
@@ -932,7 +932,7 @@ print('LogisticRegression score: %f' % svm.fit(X_train, y_train).score(X_test, y
 
 # MLPipeline way
 from pyspark.ml import Pipeline
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
@@ -1122,7 +1122,7 @@ applicable when all features are counts of categorical values.
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
 nb = NaiveBayes(sqlCtx, laplace=1.0)
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
 y_test = nb.fit(X_train, y_train)
@@ -1257,7 +1257,7 @@ SystemML Language Reference for details.
 {% highlight python %}
 from sklearn.datasets import fetch_20newsgroups
 from sklearn.feature_extraction.text import TfidfVectorizer
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
 from sklearn import metrics
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-regression.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md
index 6585b00..992862e 100644
--- a/docs/algorithms-regression.md
+++ b/docs/algorithms-regression.md
@@ -82,7 +82,7 @@ efficient when the number of features $m$ is relatively small
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
 # C = 1/reg
 lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -124,7 +124,7 @@ y_test = lr.fit(df_train)
 <div class="codetabs">
 <div data-lang="Python" markdown="1">
 {% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
 # C = 1/reg
 lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
 # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices
@@ -222,7 +222,7 @@ SystemML Language Reference for details.
 {% highlight python %}
 import numpy as np
 from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
 from pyspark.sql import SQLContext
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()
@@ -277,7 +277,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
 {% highlight python %}
 import numpy as np
 from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
 from pyspark.sql import SQLContext
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index 3b4aeed..f040212 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -72,7 +72,7 @@ brew install apache-spark
 #### Step 1: Install SystemML Python package 
 
 ```bash
-pip install SystemML
+pip install systemml
 ```
 
 #### Step 2: Download SystemML Java binaries
@@ -81,14 +81,14 @@ SystemML Python package downloads the corresponding Java binaries (along with al
 into the installed location. To find the location of the downloaded Java binaries, use the following command:
 
 ```bash
-python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'
+python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'
 ```
 
 #### Step 3: (Optional but recommended) Set SYSTEMML_HOME environment variable
 <div class="codetabs">
 <div data-lang="OSX" markdown="1">
 ```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
 # If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
 echo '' >> ~/.bashrc
 echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -96,7 +96,7 @@ echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
 </div>
 <div data-lang="Linux" markdown="1">
 ```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
 # If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
 echo '' >> ~/.bashrc
 echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -128,7 +128,7 @@ pyspark --master local[*] --driver-class-path $SYSTEMML_HOME"/SystemML.jar"
 To get started with SystemML, let's try few elementary matrix multiplication operations:
 
 ```python
-import SystemML as sml
+import systemml as sml
 import numpy as np
 sml.setSparkContext(sc)
 m1 = sml.matrix(np.ones((3,3)) + 2)
@@ -152,7 +152,7 @@ model: $ \beta = solve(X^T X, X^T y) $. For simplicity, we will use direct-solve
 ```python
 import numpy as np
 from sklearn import datasets
-import SystemML as sml
+import systemml as sml
 from pyspark.sql import SQLContext
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()
@@ -196,7 +196,7 @@ algorithm.
 ```python
 import numpy as np
 from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
 from pyspark.sql import SQLContext
 # Load the diabetes dataset
 diabetes = datasets.load_diabetes()
@@ -230,7 +230,7 @@ algorithm on digits datasets.
 ```python
 # Scikit-learn way
 from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
 digits = datasets.load_digits()
@@ -245,15 +245,21 @@ logistic = LogisticRegression(sqlCtx)
 print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
 ```
 
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
 ### Passing PySpark DataFrame
 
 To train the above algorithm on larger dataset, we can load the dataset into DataFrame and pass it to the `fit` method:
 
 ```python
 from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 from pyspark.sql import SQLContext
-import SystemML as sml
+import systemml as sml
 sqlCtx = SQLContext(sc)
 digits = datasets.load_digits()
 X_digits = digits.data
@@ -267,6 +273,12 @@ logistic = LogisticRegression(sqlCtx)
 print('LogisticRegression score: %f' % logistic.fit(df_train).score(X_test, y_test))
 ```
 
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
 ### MLPipeline interface
 
 In the below example, we demonstrate how the same `LogisticRegression` class can allow SystemML to fit seamlessly into 
@@ -275,7 +287,7 @@ large data pipelines.
 ```python
 # MLPipeline way
 from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.sql import SQLContext
 sqlCtx = SQLContext(sc)
@@ -307,6 +319,19 @@ prediction = model.transform(test)
 prediction.show()
 ```
 
+Output:
+
+```bash
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|id|           text|               words|            features|         probability| ID|prediction|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|12|    spark i j k|ArrayBuffer(spark...|(20,[5,6,7],[2.0,...|[0.99999999999975...|1.0|       1.0|
+|13|          l m n|ArrayBuffer(l, m, n)|(20,[8,9,10],[1.0...|[1.37552128844736...|2.0|       2.0|
+|14|mapreduce spark|ArrayBuffer(mapre...|(20,[5,10],[1.0,1...|[0.99860290938153...|3.0|       1.0|
+|15|  apache hadoop|ArrayBuffer(apach...|(20,[9,14],[1.0,1...|[5.41688748236143...|4.0|       2.0|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+```
+
 ## Invoking DML/PyDML scripts using MLContext
 
 The below example demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml)
@@ -315,7 +340,7 @@ using Python [MLContext API](https://apache.github.io/incubator-systemml/spark-m
 ```python
 from sklearn import datasets, neighbors
 from pyspark.sql import DataFrame, SQLContext
-import SystemML as sml
+import systemml as sml
 import pandas as pd
 import os
 sqlCtx = SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/MANIFEST.in
----------------------------------------------------------------------
diff --git a/src/main/python/MANIFEST.in b/src/main/python/MANIFEST.in
index a185263..a0835d2 100644
--- a/src/main/python/MANIFEST.in
+++ b/src/main/python/MANIFEST.in
@@ -18,12 +18,12 @@
 # under the License.
 #
 #-------------------------------------------------------------
-include SystemML/SystemML-java/LICENSE
-include SystemML/SystemML-java/SystemML-config.xml
-include SystemML/SystemML-java/NOTICE
-include SystemML/SystemML-java/SystemML.jar
-include SystemML/SystemML-java/DISCLAIMER
-include SystemML/SystemML-java/scripts/sparkDML.sh
-recursive-include SystemML/SystemML-java/scripts/algorithms *
-recursive-include SystemML/SystemML-java/scripts/datagen *
-recursive-include SystemML/SystemML-java/scripts/utils *
\ No newline at end of file
+include systemml/systemml-java/LICENSE
+include systemml/systemml-java/SystemML-config.xml
+include systemml/systemml-java/NOTICE
+include systemml/systemml-java/SystemML.jar
+include systemml/systemml-java/DISCLAIMER
+include systemml/systemml-java/scripts/sparkDML.sh
+recursive-include systemml/systemml-java/scripts/algorithms *
+recursive-include systemml/systemml-java/scripts/datagen *
+recursive-include systemml/systemml-java/scripts/utils *
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/__init__.py b/src/main/python/SystemML/__init__.py
deleted file mode 100644
index 02a940b..0000000
--- a/src/main/python/SystemML/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from .mlcontext import *
-from .defmatrix import *
-from .converters import *
-
-__all__ = mlcontext.__all__
-__all__ += defmatrix.__all__
-__all__ += converters.__all__
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/converters.py b/src/main/python/SystemML/converters.py
deleted file mode 100644
index 9588bec..0000000
--- a/src/main/python/SystemML/converters.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from pyspark.context import SparkContext 
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
-import numpy as np
-import pandas as pd
-import sklearn as sk
-
-from scipy.sparse import spmatrix
-from scipy.sparse import coo_matrix
-
-SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
-
-def getNumCols(numPyArr):
-    if numPyArr.ndim == 1:
-        return 1
-    else:
-        return numPyArr.shape[1]
-       
-def convertToLabeledDF(sqlCtx, X, y=None):
-    from pyspark.ml.feature import VectorAssembler
-    if y is not None:
-        pd1 = pd.DataFrame(X)
-        pd2 = pd.DataFrame(y, columns=['label'])
-        pdf = pd.concat([pd1, pd2], axis=1)
-        inputColumns = ['C' + str(i) for i in pd1.columns]
-        outputColumns = inputColumns + ['label']
-    else:
-        pdf = pd.DataFrame(X)
-        inputColumns = ['C' + str(i) for i in pdf.columns]
-        outputColumns = inputColumns
-    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
-    out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns))
-    if y is not None:
-        return out.select('features', 'label')
-    else:
-        return out.select('features')
-    
-
-def convertToMatrixBlock(sc, src):
-    if isinstance(src, spmatrix):
-        src = coo_matrix(src,  dtype=np.float64)
-        numRows = src.shape[0]
-        numCols = src.shape[1]
-        data = src.data
-        row = src.row.astype(np.int32)
-        col = src.col.astype(np.int32)
-        nnz = len(src.col)
-        buf1 = bytearray(data.tostring())
-        buf2 = bytearray(row.tostring())
-        buf3 = bytearray(col.tostring())
-        return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz)
-    elif isinstance(sc, SparkContext):
-        src = np.asarray(src)
-        numCols = getNumCols(src)
-        numRows = src.shape[0]
-        arr = src.ravel().astype(np.float64)
-        buf = bytearray(arr.tostring())
-        return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
-    else:
-        raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-    
-
-def convertToNumpyArr(sc, mb):
-    if isinstance(sc, SparkContext):
-        numRows = mb.getNumRows()
-        numCols = mb.getNumColumns()
-        buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb)
-        return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols))
-    else:
-        raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-
-
-def convertToPandasDF(X):
-    if not isinstance(X, pd.DataFrame):
-        return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
-    return X
-    
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
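These converter helpers reappear unchanged under src/main/python/systemml/converters.py (see the diffstat above). A small round-trip sketch, assuming a live SparkContext `sc` from pyspark (illustrative only):

    import numpy as np
    from systemml import convertToMatrixBlock, convertToNumpyArr

    # NumPy -> JVM MatrixBlock -> NumPy; shape and values should survive the round trip.
    A = np.array([[1.0, 2.0], [3.0, 4.0]])
    mb = convertToMatrixBlock(sc, A)
    B = convertToNumpyArr(sc, mb)
    assert np.allclose(A, B)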

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/defmatrix.py b/src/main/python/SystemML/defmatrix.py
deleted file mode 100644
index 18f6314..0000000
--- a/src/main/python/SystemML/defmatrix.py
+++ /dev/null
@@ -1,410 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-import numpy as np
-
-from . import pydml, MLContext
-from .converters import *
-from pyspark import SparkContext, RDD
-from pyspark.sql import DataFrame, SQLContext
-
-def setSparkContext(sc):
-    """
-    Before using the matrix, the user needs to invoke this function.
-
-    Parameters
-    ----------
-    sc: SparkContext
-        SparkContext
-    """
-    matrix.ml = MLContext(sc)
-    matrix.sc = sc
-
-def checkIfMLContextIsSet():
-    if matrix.ml is None:
-        raise Exception('Expected setSparkContext(sc) to be called.')
-
-class DMLOp(object):
-    """
-    Represents an intermediate node of Abstract syntax tree created to generate the PyDML script
-    """
-    def __init__(self, inputs, dml=None):
-        self.inputs = inputs
-        self.dml = dml
-
-    def _visit(self, execute=True):
-        matrix.dml = matrix.dml + self.dml
-
-
-def reset():
-    """
-    Resets the visited status of matrix and the operators in the generated AST.
-    """
-    for m in matrix.visited:
-        m.visited = False
-    matrix.visited = []
-
-def binaryOp(lhs, rhs, opStr):
-    """
-    Common function called by all the binary operators in matrix class
-    """
-    inputs = []
-    if isinstance(lhs, matrix):
-        lhsStr = lhs.ID
-        inputs = [lhs]
-    elif isinstance(lhs, float) or isinstance(lhs, int):
-        lhsStr = str(lhs)
-    else:
-        raise TypeError('Incorrect type')
-    if isinstance(rhs, matrix):
-        rhsStr = rhs.ID
-        inputs = inputs + [rhs]
-    elif isinstance(rhs, float) or isinstance(rhs, int):
-        rhsStr = str(rhs)
-    else:
-        raise TypeError('Incorrect type')
-    dmlOp = DMLOp(inputs)
-    out = matrix(None, op=dmlOp)
-    dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
-    return out
-
-def binaryMatrixFunction(X, Y, fnName):
-    """
-    Common function called by supported PyDML built-in function that has two arguments both of which are matrices.
-    TODO: This needs to be generalized to support arbitrary arguments of differen types.
-    """
-    if not isinstance(X, matrix) or not isinstance(Y, matrix):
-        raise TypeError('Incorrect input type. Expected matrix type')
-    inputs = [X, Y]
-    dmlOp = DMLOp(inputs)
-    out = matrix(None, op=dmlOp)
-    dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
-    return out
-
-def solve(A, b):
-    """
-    Computes the least squares solution for system of linear equations A %*% x = b
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from sklearn import datasets
-    >>> import SystemML as sml
-    >>> from pyspark.sql import SQLContext
-    >>> diabetes = datasets.load_diabetes()
-    >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
-    >>> X_train = diabetes_X[:-20]
-    >>> X_test = diabetes_X[-20:]
-    >>> y_train = diabetes.target[:-20]
-    >>> y_test = diabetes.target[-20:]
-    >>> sml.setSparkContext(sc)
-    >>> X = sml.matrix(X_train)
-    >>> y = sml.matrix(y_train)
-    >>> A = X.transpose().dot(X)
-    >>> b = X.transpose().dot(y)
-    >>> beta = sml.solve(A, b).toNumPyArray()
-    >>> y_predicted = X_test.dot(beta)
-    >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
-    Residual sum of squares: 25282.12
-    """
-    return binaryMatrixFunction(A, b, 'solve')
-
-def eval(outputs, outputDF=False, execute=True):
-    """
-    Executes the unevaluated DML script and computes the matrices specified by outputs.
-
-    Parameters
-    ----------
-    outputs: list of matrices
-    outputDF: if True, the data of each output matrix is returned as a PySpark DataFrame; otherwise as a NumPy array (default: False)
-    """
-    checkIfMLContextIsSet()
-    reset()
-    matrix.dml = []
-    matrix.script = pydml('')
-    if isinstance(outputs, matrix):
-        outputs = [ outputs ]
-    elif not isinstance(outputs, list):
-        raise TypeError('Incorrect input type')
-    for m in outputs:
-        m.output = True
-        m._visit(execute=execute)
-    if not execute:
-        return ''.join(matrix.dml)
-    matrix.script.scriptString = ''.join(matrix.dml)
-    results = matrix.ml.execute(matrix.script)
-    # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
-    for m in outputs:
-        if outputDF:
-            m.data = results.getDataFrame(m.ID)
-        else:
-            m.data = results.getNumPyArray(m.ID)
-
-class matrix(object):
-    """
-    matrix class is a Python wrapper that implements basic matrix operators.
-    Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
-
-    Examples
-    --------
-    >>> import SystemML as sml
-    >>> import numpy as np
-    >>> sml.setSparkContext(sc)
-
-    Welcome to Apache SystemML!
-
-    >>> m1 = sml.matrix(np.ones((3,3)) + 2)
-    >>> m2 = sml.matrix(np.ones((3,3)) + 3)
-    >>> m2 = m1 * (m2 + m1)
-    >>> m4 = 1.0 - m2
-    >>> m4
-    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
-    mVar1 = load(" ", format="csv")
-    mVar2 = load(" ", format="csv")
-    mVar3 = mVar2 + mVar1
-    mVar4 = mVar1 * mVar3
-    mVar5 = 1.0 - mVar4
-    save(mVar5, " ")
-
-    <SystemML.defmatrix.matrix object>
-    >>> m2.eval()
-    >>> m2
-    # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
-    <SystemML.defmatrix.matrix object>
-    >>> m4
-    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
-    mVar4 = load(" ", format="csv")
-    mVar5 = 1.0 - mVar4
-    save(mVar5, " ")
-
-    <SystemML.defmatrix.matrix object>
-    >>> m4.sum(axis=1).toNumPyArray()
-    array([[-60.],
-           [-60.],
-           [-60.]])
-    """
-    # Global variable that is used to keep track of intermediate matrix variables in the DML script
-    systemmlVarID = 0
-
-    # Since joining strings is an expensive operation, we collect the strings into a list and join
-    # them just before execution: see matrix.script.scriptString = ''.join(matrix.dml) in the eval() method
-    dml = []
-
-    # Represents MLContext's script object
-    script = None
-
-    # Represents MLContext object
-    ml = None
-
-    # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects
-    # that have been previously evaluated.
-    visited = []
-
-    def __init__(self, data, op=None):
-        """
-        Constructs a lazy matrix
-
-        Parameters
-        ----------
-        data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation).
-        """
-        checkIfMLContextIsSet()
-        self.visited = False
-        matrix.systemmlVarID += 1
-        self.output = False
-        self.ID = 'mVar' + str(matrix.systemmlVarID)
-        if isinstance(data, SUPPORTED_TYPES):
-            self.data = data
-        elif hasattr(data, '_jdf'):
-            self.data = data
-        elif data is None and op is not None:
-            self.data = None
-            # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
-            self.op = op
-        else:
-            raise TypeError('Unsupported input type')
-
-    def eval(self, outputDF=False):
-        """
-        This is a convenience function that calls the global eval method
-        """
-        eval([self], outputDF=outputDF)
-
-    def toPandas(self):
-        """
-        This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame.
-        """
-        if self.data is None:
-            self.eval()
-        return convertToPandasDF(self.data)
-
-    def toNumPyArray(self):
-        """
-        This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array.
-        """
-        if self.data is None:
-            self.eval()
-        if isinstance(self.data, DataFrame):
-            self.data = self.data.toPandas().as_matrix()
-        # Always keep default format as NumPy array if possible
-        return self.data
-
-    def toDataFrame(self):
-        """
-        This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame.
-        """
-        if self.data is None:
-            self.eval(outputDF=True)
-        if not isinstance(self.data, DataFrame):
-            if MLResults.sqlContext is None:
-                MLResults.sqlContext = SQLContext(matrix.sc)
-            self.data = MLResults.sqlContext.createDataFrame(self.toPandas())
-        return self.data
-
-    def _visit(self, execute=True):
-        """
-        This function is called for two scenarios:
-        1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method.
-        2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself
-        and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext.
-        """
-        if self.visited:
-            return self
-        self.visited = True
-        # for cleanup
-        matrix.visited = matrix.visited + [ self ]
-        if self.data is not None:
-            matrix.dml = matrix.dml + [ self.ID,  ' = load(\" \", format=\"csv\")\n']
-            if isinstance(self.data, DataFrame) and execute:
-                matrix.script.input(self.ID, self.data)
-            elif  execute:
-                matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
-            return self
-        elif self.op is not None:
-            for m in self.op.inputs:
-                m._visit(execute=execute)
-            self.op._visit(execute=execute)
-        else:
-            raise Exception('Expected either op or data to be set')
-        if self.data is None and self.output:
-            matrix.dml = matrix.dml + ['save(',  self.ID, ', \" \")\n']
-            if execute:
-                matrix.script.output(self.ID)
-        return self
-
-    def __repr__(self):
-        """
-        This function helps to debug matrix class and also examine the generated PyDML script
-        """
-        if self.data is None:
-            print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False))
-        elif isinstance(self.data, DataFrame):
-            print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
-        else:
-            print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
-        return '<SystemML.defmatrix.matrix object>'
-
-    def __add__(self, other):
-        return binaryOp(self, other, ' + ')
-
-    def __sub__(self, other):
-        return binaryOp(self, other, ' - ')
-
-    def __mul__(self, other):
-        return binaryOp(self, other, ' * ')
-
-    def __floordiv__(self, other):
-        return binaryOp(self, other, ' // ')
-
-    def __div__(self, other):
-        return binaryOp(self, other, ' / ')
-
-    def __mod__(self, other):
-        return binaryOp(self, other, ' % ')
-
-    def __pow__(self, other):
-        return binaryOp(self, other, ' ** ')
-
-    def __radd__(self, other):
-        return binaryOp(other, self, ' + ')
-
-    def __rsub__(self, other):
-        return binaryOp(other, self, ' - ')
-
-    def __rmul__(self, other):
-        return binaryOp(other, self, ' * ')
-
-    def __rfloordiv__(self, other):
-        return binaryOp(other, self, ' // ')
-
-    def __rdiv__(self, other):
-        return binaryOp(other, self, ' / ')
-
-    def __rmod__(self, other):
-        return binaryOp(other, self, ' % ')
-
-    def __rpow__(self, other):
-        return binaryOp(other, self, ' ** ')
-
-    def sum(self, axis=None):
-        return self._aggFn('sum', axis)
-
-    def mean(self, axis=None):
-        return self._aggFn('mean', axis)
-
-    def max(self, axis=None):
-        return self._aggFn('max', axis)
-
-    def min(self, axis=None):
-        return self._aggFn('min', axis)
-
-    def argmin(self, axis=None):
-        return self._aggFn('argmin', axis)
-
-    def argmax(self, axis=None):
-        return self._aggFn('argmax', axis)
-
-    def cumsum(self, axis=None):
-        return self._aggFn('cumsum', axis)
-
-    def transpose(self, axis=None):
-        return self._aggFn('transpose', axis)
-
-    def trace(self, axis=None):
-        return self._aggFn('trace', axis)
-
-    def _aggFn(self, fnName, axis):
-        """
-        Common function that is called for functions that have axis as parameter.
-        """
-        dmlOp = DMLOp([self])
-        out = matrix(None, op=dmlOp)
-        if axis is None:
-            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n']
-        else:
-            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
-        return out
-
-    def dot(self, other):
-        return binaryMatrixFunction(self, other, 'dot')
-
-__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mlcontext.py b/src/main/python/SystemML/mlcontext.py
deleted file mode 100644
index 1b90e70..0000000
--- a/src/main/python/SystemML/mlcontext.py
+++ /dev/null
@@ -1,302 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-import os
-
-try:
-    import py4j.java_gateway
-    from py4j.java_gateway import JavaObject
-except ImportError:
-    raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
-
-from pyspark import SparkContext
-import pyspark.mllib.common
-from pyspark.sql import DataFrame, SQLContext
-from .converters import *
-
-def dml(scriptString):
-    """
-    Create a dml script object based on a string.
-
-    Parameters
-    ----------
-    scriptString: string
-        Can be a path to a dml script or a dml script itself.
-
-    Returns
-    -------
-    script: Script instance
-        Instance of a script object.
-    """
-    if not isinstance(scriptString, str):
-        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
-    return Script(scriptString, scriptType="dml")
-
-
-def pydml(scriptString):
-    """
-    Create a pydml script object based on a string.
-
-    Parameters
-    ----------
-    scriptString: string
-        Can be a path to a pydml script or a pydml script itself.
-
-    Returns
-    -------
-    script: Script instance
-        Instance of a script object.
-    """
-    if not isinstance(scriptString, str):
-        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
-    return Script(scriptString, scriptType="pydml")
-
-
-def _java2py(sc, obj):
-    """ Convert Java object to Python. """
-    # TODO: Port this private PySpark function.
-    obj = pyspark.mllib.common._java2py(sc, obj)
-    if isinstance(obj, JavaObject):
-        class_name = obj.getClass().getSimpleName()
-        if class_name == 'Matrix':
-            obj = Matrix(obj, sc)
-    return obj
-
-
-def _py2java(sc, obj):
-    """ Convert Python object to Java. """
-    if isinstance(obj, Matrix):
-        obj = obj._java_matrix
-    # TODO: Port this private PySpark function.
-    obj = pyspark.mllib.common._py2java(sc, obj)
-    return obj
-
-
-class Matrix(object):
-    """
-    Wrapper around a Java Matrix object.
-
-    Parameters
-    ----------
-    javaMatrix: JavaObject
-        A Java Matrix object as returned by calling `ml.execute().get()`.
-
-    sc: SparkContext
-        SparkContext
-    """
-    def __init__(self, javaMatrix, sc):
-        self._java_matrix = javaMatrix
-        self.sc = sc
-
-    def __repr__(self):
-        return "Matrix"
-
-    def toDF(self):
-        """
-        Convert the Matrix to a PySpark SQL DataFrame.
-
-        Returns
-        -------
-        df: PySpark SQL DataFrame
-            A PySpark SQL DataFrame representing the matrix, with
-            one "ID" column containing the row index (since Spark
-            DataFrames are unordered), followed by columns of doubles
-            for each column in the matrix.
-        """
-        jdf = self._java_matrix.asDataFrame()
-        df = _java2py(self.sc, jdf)
-        return df
-
-
-class MLResults(object):
-    """
-    Wrapper around a Java ML Results object.
-
-    Parameters
-    ----------
-    results: JavaObject
-        A Java MLResults object as returned by calling `ml.execute()`.
-
-    sc: SparkContext
-        SparkContext
-    """
-    def __init__(self, results, sc):
-        self._java_results = results
-        self.sc = sc
-        try:
-            if MLResults.sqlContext is None:
-                MLResults.sqlContext = SQLContext(sc)
-        except AttributeError:
-            MLResults.sqlContext = SQLContext(sc)
-
-    def __repr__(self):
-        return "MLResults"
-
-    def getNumPyArray(self, *outputs):
-        """
-        Parameters
-        ----------
-        outputs: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).asBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
-        if len(outs) == 1:
-            return outs[0]
-        return outs
-
-    def getDataFrame(self, *outputs):
-        """
-        Parameters
-        ----------
-        outputs: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
-        if len(outs) == 1:
-            return outs[0]
-        return outs
-
-    def get(self, *outputs):
-        """
-        Parameters
-        ----------
-        outputs: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
-        if len(outs) == 1:
-            return outs[0]
-        return outs
-
-
-class Script(object):
-    """
-    Instance of a DML/PyDML Script.
-
-    Parameters
-    ----------
-    scriptString: string
-        Can be either a file path to a DML script or a DML script itself.
-
-    scriptType: string
-        Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like).
-    """
-    def __init__(self, scriptString, scriptType="dml"):
-        self.scriptString = scriptString
-        self.scriptType = scriptType
-        self._input = {}
-        self._output = []
-
-    def input(self, *args, **kwargs):
-        """
-        Parameters
-        ----------
-        args: name, value tuple
-            where name is a string, and the currently supported value formats
-            are double, string, dataframe, rdd, and lists of such objects.
-
-        kwargs: dict of name, value pairs
-            The supported name and value formats are the same as described above.
-        """
-        if args and len(args) != 2:
-            raise ValueError("Expected name, value pair.")
-        elif args:
-            self._input[args[0]] = args[1]
-        for name, value in kwargs.items():
-            self._input[name] = value
-        return self
-
-    def output(self, *names):
-        """
-        Parameters
-        ----------
-        names: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        self._output.extend(names)
-        return self
-
-
-class MLContext(object):
-    """
-    Wrapper around the new SystemML MLContext.
-
-    Parameters
-    ----------
-    sc: SparkContext
-        SparkContext
-    """
-    def __init__(self, sc):
-        if not isinstance(sc, SparkContext):
-            raise ValueError("Expected sc to be a SparkContext, got " % sc)
-        self._sc = sc
-        self._ml = sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc)
-
-    def __repr__(self):
-        return "MLContext"
-
-    def execute(self, script):
-        """
-        Execute a DML / PyDML script.
-
-        Parameters
-        ----------
-        script: Script instance
-            Script instance defined with the appropriate input and output variables.
-
-        Returns
-        -------
-        ml_results: MLResults
-            MLResults instance.
-        """
-        if not isinstance(script, Script):
-            raise ValueError("Expected script to be an instance of Script")
-        scriptString = script.scriptString
-        if script.scriptType == "dml":
-            if scriptString.endswith(".dml"):
-                if os.path.exists(scriptString):
-                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString)
-                else:
-                    raise ValueError("path: %s does not exist" % scriptString)
-            else:
-                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString)
-        elif script.scriptType == "pydml":
-            if scriptString.endswith(".pydml"):
-                if os.path.exists(scriptString):
-                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString)
-                else:
-                    raise ValueError("path: %s does not exist" % scriptString)
-            else:
-                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString)
-
-        for key, val in script._input.items():
-            # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not
-            # allowed. Therefore, we use the following code in which we retrieve a function
-            # representing `script_java.in`, and then call it with the arguments.  This is in
-            # lieu of adding a new `input` method on the JVM side, as that would complicate use
-            # from Scala/Java.
-            py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val))
-        for val in script._output:
-            script_java.out(val)
-        return MLResults(self._ml.execute(script_java), self._sc)
-
-
-__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml']

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mllearn/__init__.py b/src/main/python/SystemML/mllearn/__init__.py
deleted file mode 100644
index 69cab58..0000000
--- a/src/main/python/SystemML/mllearn/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from .estimators import *
-
-__all__ = estimators.__all__
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mllearn/estimators.py b/src/main/python/SystemML/mllearn/estimators.py
deleted file mode 100644
index 5d33d64..0000000
--- a/src/main/python/SystemML/mllearn/estimators.py
+++ /dev/null
@@ -1,302 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from pyspark.context import SparkContext 
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
-import numpy as np
-import pandas as pd
-import sklearn as sk
-from pyspark.ml.feature import VectorAssembler
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml import Estimator, Model
-
-from ..converters import *
-
-def assemble(sqlCtx, pdf, inputCols, outputCol):
-    tmpDF = sqlCtx.createDataFrame(pdf, list(pdf.columns))
-    assembler = VectorAssembler(inputCols=list(inputCols), outputCol=outputCol)
-    return assembler.transform(tmpDF)
-
-class BaseSystemMLEstimator(Estimator):
-    featuresCol = 'features'
-    labelCol = 'label'
-    
-    def setFeaturesCol(self, colName):
-        """
-        Sets the default column name for features of PySpark DataFrame.
-        
-        Parameters
-        ----------
-        colName: column name for features (default: 'features')
-        """
-        self.featuresCol = colName
-        
-    def setLabelCol(self, colName):
-        """
-        Sets the default column name for the label column of PySpark DataFrame.
-        
-        Parameters
-        ----------
-        colName: column name for the label (default: 'label')
-        """
-        self.labelCol = colName
-        
-    # Returns a model after calling fit(df) on Estimator object on JVM    
-    def _fit(self, X):
-        """
-        Invokes the fit method on the Estimator object on JVM if X is a PySpark DataFrame
-        
-        Parameters
-        ----------
-        X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label')
-        """
-        if hasattr(X, '_jdf') and self.featuresCol in X.columns and self.labelCol in X.columns:
-            self.model = self.estimator.fit(X._jdf)
-            return self
-        else:
-            raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
-    
-    def fit(self, X, y=None, params=None):
-        """
-        Invokes the fit method on the Estimator object on JVM if X and y are one of the supported data types
-        
-        Parameters
-        ----------
-        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        """
-        if y is None:
-            return self._fit(X)
-        elif y is not None and isinstance(X, SUPPORTED_TYPES) and isinstance(y, SUPPORTED_TYPES):
-            if self.transferUsingDF:
-                pdfX = convertToPandasDF(X)
-                pdfY = convertToPandasDF(y)
-                if getNumCols(pdfY) != 1:
-                    raise Exception('y should be a column vector')
-                if pdfX.shape[0] != pdfY.shape[0]:
-                    raise Exception('Number of rows of X and y should match')
-                colNames = pdfX.columns
-                pdfX[self.labelCol] = pdfY[pdfY.columns[0]]
-                df = assemble(self.sqlCtx, pdfX, colNames, self.featuresCol).select(self.featuresCol, self.labelCol)
-                self.model = self.estimator.fit(df._jdf)
-            else:
-                numColsy = getNumCols(y)
-                if numColsy != 1:
-                    raise Exception('Expected y to be a column vector')
-                self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y))
-            if self.setOutputRawPredictionsToFalse:
-                self.model.setOutputRawPredictions(False)
-            return self
-        else:
-            raise Exception('Unsupported input type')
-    
-    def transform(self, X):
-        return self.predict(X)
-    
-    # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM    
-    def predict(self, X):
-        """
-        Invokes the transform method on the Model object on JVM if X is one of the supported data types
-        
-        Parameters
-        ----------
-        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
-        """
-        if isinstance(X, SUPPORTED_TYPES):
-            if self.transferUsingDF:
-                pdfX = convertToPandasDF(X)
-                df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
-                retjDF = self.model.transform(df._jdf)
-                retDF = DataFrame(retjDF, self.sqlCtx)
-                retPDF = retDF.sort('ID').select('prediction').toPandas()
-                if isinstance(X, np.ndarray):
-                    return retPDF.as_matrix().flatten()
-                else:
-                    return retPDF
-            else:
-                retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
-                if isinstance(X, np.ndarray):
-                    return retNumPy
-                else:
-                    return retNumPy # TODO: Convert to Pandas
-        elif hasattr(X, '_jdf'):
-            if self.featuresCol in X.columns:
-                # No need to assemble as input DF is likely coming via MLPipeline
-                df = X
-            else:
-                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
-                df = assembler.transform(X)
-            retjDF = self.model.transform(df._jdf)
-            retDF = DataFrame(retjDF, self.sqlCtx)
-            # Return DF
-            return retDF.sort('ID')
-        else:
-            raise Exception('Unsupported input type')
-            
-class BaseSystemMLClassifier(BaseSystemMLEstimator):
-
-    def score(self, X, y):
-        """
-        Scores the predicted value with ground truth 'y'
-        
-        Parameters
-        ----------
-        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        """
-        return sk.metrics.accuracy_score(y, self.predict(X))    
-
-class BaseSystemMLRegressor(BaseSystemMLEstimator):
-
-    def score(self, X, y):
-        """
-        Scores the predicted value with ground truth 'y'
-        
-        Parameters
-        ----------
-        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
-        """
-        return sk.metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')
-
-
-class LogisticRegression(BaseSystemMLClassifier):
-    def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
-        """
-        Performs both binomial and multinomial logistic regression.
-        
-        Parameters
-        ----------
-        sqlCtx: PySpark SQLContext
-        penalty: Only 'l2' supported
-        fit_intercept: Specifies whether to add intercept or not (default: True)
-        max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100)
-        max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0)
-        tol: Tolerance used in the convergence criterion (default: 0.000001)
-        C: 1/regularization parameter (default: 1.0)
-        solver: Only 'newton-cg' solver supported
-        """
-        self.sqlCtx = sqlCtx
-        self.sc = sqlCtx._sc
-        self.uid = "logReg"
-        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc())
-        self.estimator.setMaxOuterIter(max_iter)
-        self.estimator.setMaxInnerIter(max_inner_iter)
-        if C <= 0:
-            raise Exception('C has to be positive')
-        reg = 1.0 / C
-        self.estimator.setRegParam(reg)
-        self.estimator.setTol(tol)
-        self.estimator.setIcpt(int(fit_intercept))
-        self.transferUsingDF = transferUsingDF
-        self.setOutputRawPredictionsToFalse = True
-        if penalty != 'l2':
-            raise Exception('Only l2 penalty is supported')
-        if solver != 'newton-cg':
-            raise Exception('Only newton-cg solver supported')
-
-class LinearRegression(BaseSystemMLRegressor):
-
-    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
-        """
-        Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
-        
-        Parameters
-        ----------
-        sqlCtx: PySpark SQLContext
-        fit_intercept: Specifies whether to add intercept or not (default: True)
-        max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
-        tol: Tolerance used in the convergence criterion (default: 0.000001)
-        C: 1/regularization parameter (default: 1.0)
-        solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').  
-        Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
-        'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
-        input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
-        """
-        self.sqlCtx = sqlCtx
-        self.sc = sqlCtx._sc
-        self.uid = "lr"
-        if solver == 'newton-cg' or solver == 'direct-solve':
-            self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver)
-        else:
-            raise Exception("Only 'newton-cg' and 'direct-solve' solvers are supported")
-        self.estimator.setMaxIter(max_iter)
-        if C <= 0:
-            raise Exception('C has to be positive')
-        reg = 1.0 / C
-        self.estimator.setRegParam(reg)
-        self.estimator.setTol(tol)
-        self.estimator.setIcpt(int(fit_intercept))
-        self.transferUsingDF = transferUsingDF
-        self.setOutputRawPredictionsToFalse = False
-
-
-class SVM(BaseSystemMLClassifier):
-
-    def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
-        """
-        Performs both binary-class and multiclass SVM (Support Vector Machines).
-        
-        Parameters
-        ----------
-        sqlCtx: PySpark SQLContext
-        fit_intercept: Specifies whether to add intercept or not (default: True)
-        max_iter: Maximum number of iterations (default: 100)
-        tol: Tolerance used in the convergence criterion (default: 0.000001)
-        C: 1/regularization parameter (default: 1.0)
-        is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False)
-        """
-        self.sqlCtx = sqlCtx
-        self.sc = sqlCtx._sc
-        self.uid = "svm"
-        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class)
-        self.estimator.setMaxIter(max_iter)
-        if C <= 0:
-            raise Exception('C has to be positive')
-        reg = 1.0 / C
-        self.estimator.setRegParam(reg)
-        self.estimator.setTol(tol)
-        self.estimator.setIcpt(int(fit_intercept))
-        self.transferUsingDF = transferUsingDF
-        self.setOutputRawPredictionsToFalse = False    
-
-class NaiveBayes(BaseSystemMLClassifier):
-
-    def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
-        """
-        Performs Naive Bayes classification.
-        
-        Parameters
-        ----------
-        sqlCtx: PySpark SQLContext
-        laplace: Laplace smoothing specified by the user to avoid creation of 0 probabilities (default: 1.0)
-        """
-        self.sqlCtx = sqlCtx
-        self.sc = sqlCtx._sc
-        self.uid = "nb"
-        self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc())
-        self.estimator.setLaplace(laplace)
-        self.transferUsingDF = transferUsingDF
-        self.setOutputRawPredictionsToFalse = False
-
-__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']
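
For reference, a minimal scikit-learn-style sketch of how the mllearn estimators above are used. This assumes a live SparkContext sc (e.g. from the pyspark shell) and that the same estimators are re-added under the renamed systemml.mllearn package elsewhere in this commit; the data and parameter values below are illustrative only:

    import numpy as np
    from pyspark.sql import SQLContext
    from systemml.mllearn import LogisticRegression

    sqlCtx = SQLContext(sc)
    X = np.random.rand(100, 3)
    y = (X[:, 0] > 0.5).astype(np.float64) + 1   # labels encoded as 1.0 / 2.0
    lr = LogisticRegression(sqlCtx, max_iter=10)
    lr.fit(X, y)                 # X and y are transferred as MatrixBlocks and fitted on the JVM
    print(lr.predict(X)[:5])     # predictions are returned as a NumPy array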

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/setup.py
----------------------------------------------------------------------
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
index 0bcebab..cc8f373 100644
--- a/src/main/python/setup.py
+++ b/src/main/python/setup.py
@@ -34,7 +34,7 @@ REQUIRED_PACKAGES = [
 ]
 
 PACKAGE_DATA = []
-for path, subdirs, files in os.walk('SystemML/SystemML-java'):
+for path, subdirs, files in os.walk('systemml/systemml-java'):
     for name in files:
         PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ]
         
@@ -61,7 +61,7 @@ setup(
     install_requires=REQUIRED_PACKAGES,
     include_package_data=True,
     package_data={
-        'SystemML-java': PACKAGE_DATA
+        'systemml-java': PACKAGE_DATA
     },
     classifiers=[
         'Intended Audience :: Developers',

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py
new file mode 100644
index 0000000..02a940b
--- /dev/null
+++ b/src/main/python/systemml/__init__.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from .mlcontext import *
+from .defmatrix import *
+from .converters import *
+
+__all__ = mlcontext.__all__
+__all__ += defmatrix.__all__
+__all__ += converters.__all__
\ No newline at end of file
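
With the renamed package's __init__.py in place, the MLContext API (moved from SystemML/mlcontext.py above to systemml/mlcontext.py) can be exercised as follows. This is a minimal sketch assuming a live SparkContext sc and a PySpark DataFrame df whose columns are all doubles; the DML snippet itself is illustrative only:

    from systemml import MLContext, dml

    ml = MLContext(sc)
    script = (dml("total = sum(X)")    # inline DML; a path ending in .dml would be read from disk
              .input(X=df)             # the DataFrame is converted to a SystemML matrix on the JVM
              .output("total"))
    results = ml.execute(script)
    print(results.get("total"))        # scalar outputs are converted back to Python via _java2py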

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
new file mode 100644
index 0000000..9588bec
--- /dev/null
+++ b/src/main/python/systemml/converters.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from pyspark.context import SparkContext 
+from pyspark.sql import DataFrame, SQLContext
+from pyspark.rdd import RDD
+import numpy as np
+import pandas as pd
+import sklearn as sk
+
+from scipy.sparse import spmatrix
+from scipy.sparse import coo_matrix
+
+SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
+
+def getNumCols(numPyArr):
+    if numPyArr.ndim == 1:
+        return 1
+    else:
+        return numPyArr.shape[1]
+       
+def convertToLabeledDF(sqlCtx, X, y=None):
+    from pyspark.ml.feature import VectorAssembler
+    if y is not None:
+        pd1 = pd.DataFrame(X)
+        pd2 = pd.DataFrame(y, columns=['label'])
+        pdf = pd.concat([pd1, pd2], axis=1)
+        inputColumns = ['C' + str(i) for i in pd1.columns]
+        outputColumns = inputColumns + ['label']
+    else:
+        pdf = pd.DataFrame(X)
+        inputColumns = ['C' + str(i) for i in pdf.columns]
+        outputColumns = inputColumns
+    assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
+    out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns))
+    if y is not None:
+        return out.select('features', 'label')
+    else:
+        return out.select('features')
+    
+
+def convertToMatrixBlock(sc, src):
+    if isinstance(src, spmatrix):
+        src = coo_matrix(src,  dtype=np.float64)
+        numRows = src.shape[0]
+        numCols = src.shape[1]
+        data = src.data
+        row = src.row.astype(np.int32)
+        col = src.col.astype(np.int32)
+        nnz = len(src.col)
+        buf1 = bytearray(data.tostring())
+        buf2 = bytearray(row.tostring())
+        buf3 = bytearray(col.tostring())
+        return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz)
+    elif isinstance(sc, SparkContext):
+        src = np.asarray(src)
+        numCols = getNumCols(src)
+        numRows = src.shape[0]
+        arr = src.ravel().astype(np.float64)
+        buf = bytearray(arr.tostring())
+        return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
+    else:
+        raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
+    
+
+def convertToNumpyArr(sc, mb):
+    if isinstance(sc, SparkContext):
+        numRows = mb.getNumRows()
+        numCols = mb.getNumColumns()
+        buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb)
+        return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols))
+    else:
+        raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
+
+
+def convertToPandasDF(X):
+    if not isinstance(X, pd.DataFrame):
+        return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
+    return X
+    
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
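
A short round-trip sketch for the converters above, assuming a live SparkContext sc; the array is illustrative:

    import numpy as np
    from systemml.converters import convertToMatrixBlock, convertToNumpyArr, convertToPandasDF

    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    mb = convertToMatrixBlock(sc, X)   # NumPy -> MatrixBlock, transferred as a Py4J byte buffer
    Y = convertToNumpyArr(sc, mb)      # MatrixBlock -> NumPy
    assert np.allclose(X, Y)
    print(convertToPandasDF(X))        # wraps X in a pandas DataFrame with columns C0, C1, ...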

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
new file mode 100644
index 0000000..18f6314
--- /dev/null
+++ b/src/main/python/systemml/defmatrix.py
@@ -0,0 +1,410 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import numpy as np
+
+from . import pydml, MLContext, MLResults
+from .converters import *
+from pyspark import SparkContext, RDD
+from pyspark.sql import DataFrame, SQLContext
+
+def setSparkContext(sc):
+    """
+    Before using the matrix, the user needs to invoke this function.
+
+    Parameters
+    ----------
+    sc: SparkContext
+        SparkContext
+    """
+    matrix.ml = MLContext(sc)
+    matrix.sc = sc
+
+def checkIfMLContextIsSet():
+    if matrix.ml is None:
+        raise Exception('Expected setSparkContext(sc) to be called.')
+
+class DMLOp(object):
+    """
+    Represents an intermediate node of the abstract syntax tree (AST) that is used to generate the PyDML script
+    """
+    def __init__(self, inputs, dml=None):
+        self.inputs = inputs
+        self.dml = dml
+
+    def _visit(self, execute=True):
+        matrix.dml = matrix.dml + self.dml
+
+
+def reset():
+    """
+    Resets the visited status of matrix and the operators in the generated AST.
+    """
+    for m in matrix.visited:
+        m.visited = False
+    matrix.visited = []
+
+def binaryOp(lhs, rhs, opStr):
+    """
+    Common function called by all the binary operators in matrix class
+    """
+    inputs = []
+    if isinstance(lhs, matrix):
+        lhsStr = lhs.ID
+        inputs = [lhs]
+    elif isinstance(lhs, float) or isinstance(lhs, int):
+        lhsStr = str(lhs)
+    else:
+        raise TypeError('Incorrect type')
+    if isinstance(rhs, matrix):
+        rhsStr = rhs.ID
+        inputs = inputs + [rhs]
+    elif isinstance(rhs, float) or isinstance(rhs, int):
+        rhsStr = str(rhs)
+    else:
+        raise TypeError('Incorrect type')
+    dmlOp = DMLOp(inputs)
+    out = matrix(None, op=dmlOp)
+    dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
+    return out
+
+def binaryMatrixFunction(X, Y, fnName):
+    """
+    Common function called by the supported PyDML built-in functions that take two matrix arguments.
+    TODO: This needs to be generalized to support arbitrary arguments of different types.
+    """
+    if not isinstance(X, matrix) or not isinstance(Y, matrix):
+        raise TypeError('Incorrect input type. Expected matrix type')
+    inputs = [X, Y]
+    dmlOp = DMLOp(inputs)
+    out = matrix(None, op=dmlOp)
+    dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
+    return out
+
+def solve(A, b):
+    """
+    Computes the least squares solution for the system of linear equations A %*% x = b
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import datasets
+    >>> import systemml as sml
+    >>> from pyspark.sql import SQLContext
+    >>> diabetes = datasets.load_diabetes()
+    >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
+    >>> X_train = diabetes_X[:-20]
+    >>> X_test = diabetes_X[-20:]
+    >>> y_train = diabetes.target[:-20]
+    >>> y_test = diabetes.target[-20:]
+    >>> sml.setSparkContext(sc)
+    >>> X = sml.matrix(X_train)
+    >>> y = sml.matrix(y_train)
+    >>> A = X.transpose().dot(X)
+    >>> b = X.transpose().dot(y)
+    >>> beta = sml.solve(A, b).toNumPyArray()
+    >>> y_predicted = X_test.dot(beta)
+    >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
+    Residual sum of squares: 25282.12
+    """
+    return binaryMatrixFunction(A, b, 'solve')
+
+def eval(outputs, outputDF=False, execute=True):
+    """
+    Executes the unevaluated DML script and computes the matrices specified by outputs.
+
+    Parameters
+    ----------
+    outputs: list of matrices
+    outputDF: if True, the data of each output matrix is returned as a PySpark DataFrame; otherwise as a NumPy array (default: False)
+    """
+    checkIfMLContextIsSet()
+    reset()
+    matrix.dml = []
+    matrix.script = pydml('')
+    if isinstance(outputs, matrix):
+        outputs = [ outputs ]
+    elif not isinstance(outputs, list):
+        raise TypeError('Incorrect input type')
+    for m in outputs:
+        m.output = True
+        m._visit(execute=execute)
+    if not execute:
+        return ''.join(matrix.dml)
+    matrix.script.scriptString = ''.join(matrix.dml)
+    results = matrix.ml.execute(matrix.script)
+    # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
+    for m in outputs:
+        if outputDF:
+            m.data = results.getDataFrame(m.ID)
+        else:
+            m.data = results.getNumPyArray(m.ID)
+
+class matrix(object):
+    """
+    matrix class is a Python wrapper that implements basic matrix operators.
+    Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
+
+    Examples
+    --------
+    >>> import systemml as sml
+    >>> import numpy as np
+    >>> sml.setSparkContext(sc)
+
+    Welcome to Apache SystemML!
+
+    >>> m1 = sml.matrix(np.ones((3,3)) + 2)
+    >>> m2 = sml.matrix(np.ones((3,3)) + 3)
+    >>> m2 = m1 * (m2 + m1)
+    >>> m4 = 1.0 - m2
+    >>> m4
+    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
+    mVar1 = load(" ", format="csv")
+    mVar2 = load(" ", format="csv")
+    mVar3 = mVar2 + mVar1
+    mVar4 = mVar1 * mVar3
+    mVar5 = 1.0 - mVar4
+    save(mVar5, " ")
+
+    <SystemML.defmatrix.matrix object>
+    >>> m2.eval()
+    >>> m2
+    # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
+    <SystemML.defmatrix.matrix object>
+    >>> m4
+    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
+    mVar4 = load(" ", format="csv")
+    mVar5 = 1.0 - mVar4
+    save(mVar5, " ")
+
+    <SystemML.defmatrix.matrix object>
+    >>> m4.sum(axis=1).toNumPyArray()
+    array([[-60.],
+           [-60.],
+           [-60.]])
+    """
+    # Global variable that is used to keep track of intermediate matrix variables in the DML script
+    systemmlVarID = 0
+
+    # Since joining strings is an expensive operation, we collect the strings into a list and join
+    # them just before execution: see matrix.script.scriptString = ''.join(matrix.dml) in the eval() method
+    dml = []
+
+    # Represents MLContext's script object
+    script = None
+
+    # Represents MLContext object
+    ml = None
+
+    # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects
+    # that have been previously evaluated.
+    visited = []
+
+    def __init__(self, data, op=None):
+        """
+        Constructs a lazy matrix
+
+        Parameters
+        ----------
+        data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation).
+        """
+        checkIfMLContextIsSet()
+        self.visited = False
+        matrix.systemmlVarID += 1
+        self.output = False
+        self.ID = 'mVar' + str(matrix.systemmlVarID)
+        if isinstance(data, SUPPORTED_TYPES):
+            self.data = data
+        elif hasattr(data, '_jdf'):
+            self.data = data
+        elif data is None and op is not None:
+            self.data = None
+            # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
+            self.op = op
+        else:
+            raise TypeError('Unsupported input type')
+
+    def eval(self, outputDF=False):
+        """
+        This is a convenience function that calls the global eval method
+        """
+        eval([self], outputDF=outputDF)
+
+    def toPandas(self):
+        """
+        This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame.
+        """
+        if self.data is None:
+            self.eval()
+        return convertToPandasDF(self.data)
+
+    def toNumPyArray(self):
+        """
+        This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array.
+        """
+        if self.data is None:
+            self.eval()
+        if isinstance(self.data, DataFrame):
+            self.data = self.data.toPandas().as_matrix()
+        # Always keep default format as NumPy array if possible
+        return self.data
+
+    def toDataFrame(self):
+        """
+        This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame.
+        """
+        if self.data is None:
+            self.eval(outputDF=True)
+        if not isinstance(self.data, DataFrame):
+            if MLResults.sqlContext is None:
+                MLResults.sqlContext = SQLContext(matrix.sc)
+            self.data = MLResults.sqlContext.createDataFrame(self.toPandas())
+        return self.data
+
+    def _visit(self, execute=True):
+        """
+        This function is called for two scenarios:
+        1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method.
+        2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself
+        and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext.
+        """
+        if self.visited:
+            return self
+        self.visited = True
+        # for cleanup
+        matrix.visited = matrix.visited + [ self ]
+        if self.data is not None:
+            matrix.dml = matrix.dml + [ self.ID,  ' = load(\" \", format=\"csv\")\n']
+            if isinstance(self.data, DataFrame) and execute:
+                matrix.script.input(self.ID, self.data)
+            elif  execute:
+                matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
+            return self
+        elif self.op is not None:
+            for m in self.op.inputs:
+                m._visit(execute=execute)
+            self.op._visit(execute=execute)
+        else:
+            raise Exception('Expected either op or data to be set')
+        if self.data is None and self.output:
+            matrix.dml = matrix.dml + ['save(',  self.ID, ', \" \")\n']
+            if execute:
+                matrix.script.output(self.ID)
+        return self
+
+    def __repr__(self):
+        """
+        This function helps to debug matrix class and also examine the generated PyDML script
+        """
+        if self.data is None:
+            print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False))
+        elif isinstance(self.data, DataFrame):
+            print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
+        else:
+            print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
+        return '<SystemML.defmatrix.matrix object>'
+
+    def __add__(self, other):
+        return binaryOp(self, other, ' + ')
+
+    def __sub__(self, other):
+        return binaryOp(self, other, ' - ')
+
+    def __mul__(self, other):
+        return binaryOp(self, other, ' * ')
+
+    def __floordiv__(self, other):
+        return binaryOp(self, other, ' // ')
+
+    def __div__(self, other):
+        return binaryOp(self, other, ' / ')
+
+    def __mod__(self, other):
+        return binaryOp(self, other, ' % ')
+
+    def __pow__(self, other):
+        return binaryOp(self, other, ' ** ')
+
+    def __radd__(self, other):
+        return binaryOp(other, self, ' + ')
+
+    def __rsub__(self, other):
+        return binaryOp(other, self, ' - ')
+
+    def __rmul__(self, other):
+        return binaryOp(other, self, ' * ')
+
+    def __rfloordiv__(self, other):
+        return binaryOp(other, self, ' // ')
+
+    def __rdiv__(self, other):
+        return binaryOp(other, self, ' / ')
+
+    def __rmod__(self, other):
+        return binaryOp(other, self, ' % ')
+
+    def __rpow__(self, other):
+        return binaryOp(other, self, ' ** ')
+
+    def sum(self, axis=None):
+        return self._aggFn('sum', axis)
+
+    def mean(self, axis=None):
+        return self._aggFn('mean', axis)
+
+    def max(self, axis=None):
+        return self._aggFn('max', axis)
+
+    def min(self, axis=None):
+        return self._aggFn('min', axis)
+
+    def argmin(self, axis=None):
+        return self._aggFn('argmin', axis)
+
+    def argmax(self, axis=None):
+        return self._aggFn('argmax', axis)
+
+    def cumsum(self, axis=None):
+        return self._aggFn('cumsum', axis)
+
+    def transpose(self, axis=None):
+        return self._aggFn('transpose', axis)
+
+    def trace(self, axis=None):
+        return self._aggFn('trace', axis)
+
+    def _aggFn(self, fnName, axis):
+        """
+        Common function that is called for functions that have axis as parameter.
+        """
+        dmlOp = DMLOp([self])
+        out = matrix(None, op=dmlOp)
+        if axis is None:
+            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n']
+        else:
+            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
+        return out
+
+    def dot(self, other):
+        return binaryMatrixFunction(self, other, 'dot')
+
+__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
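
Finally, a sketch of the lazy matrix API defined above, again assuming a live SparkContext sc; the data is illustrative. Operators only append to the generated PyDML script, and execution is deferred until the data is actually requested:

    import numpy as np
    import systemml as sml

    sml.setSparkContext(sc)
    X = sml.matrix(np.random.rand(4, 3))
    Y = (X.transpose().dot(X) + 1.0).sum(axis=0)   # builds AST nodes, nothing is executed yet
    print(sml.eval([Y], execute=False))            # inspect the generated PyDML script
    print(Y.toNumPyArray())                        # triggers evaluation through MLContext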