Posted to commits@systemml.apache.org by ni...@apache.org on 2016/09/02 00:10:52 UTC
[1/2] incubator-systemml git commit: [SYSTEMML-878] Update the Python package from SystemML to systemml
Repository: incubator-systemml
Updated Branches:
refs/heads/master 7610a21db -> 542de374e
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
new file mode 100644
index 0000000..1b90e70
--- /dev/null
+++ b/src/main/python/systemml/mlcontext.py
@@ -0,0 +1,302 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+import os
+
+try:
+ import py4j.java_gateway
+ from py4j.java_gateway import JavaObject
+except ImportError:
+ raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
+
+from pyspark import SparkContext
+import pyspark.mllib.common
+from pyspark.sql import DataFrame, SQLContext
+from .converters import *
+
+def dml(scriptString):
+ """
+ Create a dml script object based on a string.
+
+ Parameters
+ ----------
+ scriptString: string
+ Can be a path to a dml script or a dml script itself.
+
+ Returns
+ -------
+ script: Script instance
+ Instance of a script object.
+ """
+ if not isinstance(scriptString, str):
+ raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+ return Script(scriptString, scriptType="dml")
+
+
+def pydml(scriptString):
+ """
+ Create a pydml script object based on a string.
+
+ Parameters
+ ----------
+ scriptString: string
+ Can be a path to a pydml script or a pydml script itself.
+
+ Returns
+ -------
+ script: Script instance
+ Instance of a script object.
+ """
+ if not isinstance(scriptString, str):
+ raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+ return Script(scriptString, scriptType="pydml")
+
+
+def _java2py(sc, obj):
+ """ Convert Java object to Python. """
+ # TODO: Port this private PySpark function.
+ obj = pyspark.mllib.common._java2py(sc, obj)
+ if isinstance(obj, JavaObject):
+ class_name = obj.getClass().getSimpleName()
+ if class_name == 'Matrix':
+ obj = Matrix(obj, sc)
+ return obj
+
+
+def _py2java(sc, obj):
+ """ Convert Python object to Java. """
+ if isinstance(obj, Matrix):
+ obj = obj._java_matrix
+ # TODO: Port this private PySpark function.
+ obj = pyspark.mllib.common._py2java(sc, obj)
+ return obj
+
+
+class Matrix(object):
+ """
+ Wrapper around a Java Matrix object.
+
+ Parameters
+ ----------
+ javaMatrix: JavaObject
+ A Java Matrix object as returned by calling `ml.execute().get()`.
+
+ sc: SparkContext
+ SparkContext
+ """
+ def __init__(self, javaMatrix, sc):
+ self._java_matrix = javaMatrix
+ self.sc = sc
+
+ def __repr__(self):
+ return "Matrix"
+
+ def toDF(self):
+ """
+ Convert the Matrix to a PySpark SQL DataFrame.
+
+ Returns
+ -------
+ df: PySpark SQL DataFrame
+ A PySpark SQL DataFrame representing the matrix, with
+ one "ID" column containing the row index (since Spark
+ DataFrames are unordered), followed by columns of doubles
+ for each column in the matrix.
+ """
+ jdf = self._java_matrix.asDataFrame()
+ df = _java2py(self.sc, jdf)
+ return df
+
+
+class MLResults(object):
+ """
+ Wrapper around a Java ML Results object.
+
+ Parameters
+ ----------
+ results: JavaObject
+ A Java MLResults object as returned by calling `ml.execute()`.
+
+ sc: SparkContext
+ SparkContext
+ """
+ def __init__(self, results, sc):
+ self._java_results = results
+ self.sc = sc
+ try:
+ if MLResults.sqlContext is None:
+ MLResults.sqlContext = SQLContext(sc)
+ except AttributeError:
+ MLResults.sqlContext = SQLContext(sc)
+
+ def __repr__(self):
+ return "MLResults"
+
+ def getNumPyArray(self, *outputs):
+ """
+ Parameters
+ ----------
+ outputs: string, list of strings
+ Output variables as defined inside the DML script.
+ """
+ outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).asBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
+ if len(outs) == 1:
+ return outs[0]
+ return outs
+
+ def getDataFrame(self, *outputs):
+ """
+ Parameters
+ ----------
+ outputs: string, list of strings
+ Output variables as defined inside the DML script.
+ """
+ outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
+ if len(outs) == 1:
+ return outs[0]
+ return outs
+
+ def get(self, *outputs):
+ """
+ Parameters
+ ----------
+ outputs: string, list of strings
+ Output variables as defined inside the DML script.
+ """
+ outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
+ if len(outs) == 1:
+ return outs[0]
+ return outs
+
+
+class Script(object):
+ """
+ Instance of a DML/PyDML Script.
+
+ Parameters
+ ----------
+ scriptString: string
+ Can be either a file path to a DML script or a DML script itself.
+
+ scriptType: string
+ Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like).
+ """
+ def __init__(self, scriptString, scriptType="dml"):
+ self.scriptString = scriptString
+ self.scriptType = scriptType
+ self._input = {}
+ self._output = []
+
+ def input(self, *args, **kwargs):
+ """
+ Parameters
+ ----------
+ args: name, value tuple
+ where name is a string, and currently supported value formats
+ are double, string, dataframe, rdd, and lists of such objects.
+
+ kwargs: dict of name, value pairs
+ See above for the supported name and value formats.
+ """
+ if args and len(args) != 2:
+ raise ValueError("Expected name, value pair.")
+ elif args:
+ self._input[args[0]] = args[1]
+ for name, value in kwargs.items():
+ self._input[name] = value
+ return self
+
+ def output(self, *names):
+ """
+ Parameters
+ ----------
+ names: string, list of strings
+ Output variables as defined inside the DML script.
+ """
+ self._output.extend(names)
+ return self
+
+
+class MLContext(object):
+ """
+ Wrapper around the new SystemML MLContext.
+
+ Parameters
+ ----------
+ sc: SparkContext
+ SparkContext
+ """
+ def __init__(self, sc):
+ if not isinstance(sc, SparkContext):
+ raise ValueError("Expected sc to be a SparkContext, got " % sc)
+ self._sc = sc
+ self._ml = sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc)
+
+ def __repr__(self):
+ return "MLContext"
+
+ def execute(self, script):
+ """
+ Execute a DML / PyDML script.
+
+ Parameters
+ ----------
+ script: Script instance
+ Script instance defined with the appropriate input and output variables.
+
+ Returns
+ -------
+ ml_results: MLResults
+ MLResults instance.
+ """
+ if not isinstance(script, Script):
+ raise ValueError("Expected script to be an instance of Script")
+ scriptString = script.scriptString
+ if script.scriptType == "dml":
+ if scriptString.endswith(".dml"):
+ if os.path.exists(scriptString):
+ script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString)
+ else:
+ raise ValueError("path: %s does not exist" % scriptString)
+ else:
+ script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString)
+ elif script.scriptType == "pydml":
+ if scriptString.endswith(".pydml"):
+ if os.path.exists(scriptString):
+ script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString)
+ else:
+ raise ValueError("path: %s does not exist" % scriptString)
+ else:
+ script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString)
+
+ for key, val in script._input.items():
+ # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not
+ # allowed. Therefore, we use the following code in which we retrieve a function
+ # representing `script_java.in`, and then call it with the arguments. This is in
+ # lieu of adding a new `input` method on the JVM side, as that would complicate use
+ # from Scala/Java.
+ py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val))
+ for val in script._output:
+ script_java.out(val)
+ return MLResults(self._ml.execute(script_java), self._sc)
+
+
+__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml']
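
For reference, a minimal sketch of how the pieces above compose, assuming a live SparkContext `sc` (the DML expression and printed value are illustrative, not part of the commit):

    from systemml import MLContext, dml

    ml = MLContext(sc)
    # Build a Script, register an output variable, and execute it.
    script = dml("s = sum(seq(1, 100))").output("s")
    results = ml.execute(script)
    print(results.get("s"))  # expected: 5050.0
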
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/__init__.py b/src/main/python/systemml/mllearn/__init__.py
new file mode 100644
index 0000000..69cab58
--- /dev/null
+++ b/src/main/python/systemml/mllearn/__init__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from .estimators import *
+
+__all__ = estimators.__all__
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
new file mode 100644
index 0000000..5d33d64
--- /dev/null
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -0,0 +1,302 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from pyspark.context import SparkContext
+from pyspark.sql import DataFrame, SQLContext
+from pyspark.rdd import RDD
+import numpy as np
+import pandas as pd
+import sklearn as sk
+from pyspark.ml.feature import VectorAssembler
+from pyspark.mllib.linalg import Vectors
+from pyspark.ml import Estimator, Model
+
+from ..converters import *
+
+def assemble(sqlCtx, pdf, inputCols, outputCol):
+ tmpDF = sqlCtx.createDataFrame(pdf, list(pdf.columns))
+ assembler = VectorAssembler(inputCols=list(inputCols), outputCol=outputCol)
+ return assembler.transform(tmpDF)
+
+class BaseSystemMLEstimator(Estimator):
+ featuresCol = 'features'
+ labelCol = 'label'
+
+ def setFeaturesCol(self, colName):
+ """
+ Sets the default column name for the features column of the PySpark DataFrame.
+
+ Parameters
+ ----------
+ colName: column name for features (default: 'features')
+ """
+ self.featuresCol = colName
+
+ def setLabelCol(self, colName):
+ """
+ Sets the default column name for the label column of the PySpark DataFrame.
+
+ Parameters
+ ----------
+ colName: column name for label (default: 'label')
+ """
+ self.labelCol = colName
+
+ # Returns a model after calling fit(df) on Estimator object on JVM
+ def _fit(self, X):
+ """
+ Invokes the fit method on the Estimator object on the JVM if X is a PySpark DataFrame
+
+ Parameters
+ ----------
+ X: PySpark DataFrame that contains the columns featuresCol (default: 'features') and labelCol (default: 'label')
+ """
+ if hasattr(X, '_jdf') and self.featuresCol in X.columns and self.labelCol in X.columns:
+ self.model = self.estimator.fit(X._jdf)
+ return self
+ else:
+ raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
+
+ def fit(self, X, y=None, params=None):
+ """
+ Invokes the fit method on the Estimator object on the JVM if X and y are one of the supported data types
+
+ Parameters
+ ----------
+ X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ """
+ if y is None:
+ return self._fit(X)
+ elif y is not None and isinstance(X, SUPPORTED_TYPES) and isinstance(y, SUPPORTED_TYPES):
+ if self.transferUsingDF:
+ pdfX = convertToPandasDF(X)
+ pdfY = convertToPandasDF(y)
+ if getNumCols(pdfY) != 1:
+ raise Exception('y should be a column vector')
+ if pdfX.shape[0] != pdfY.shape[0]:
+ raise Exception('Number of rows of X and y should match')
+ colNames = pdfX.columns
+ pdfX[self.labelCol] = pdfY[pdfY.columns[0]]
+ df = assemble(self.sqlCtx, pdfX, colNames, self.featuresCol).select(self.featuresCol, self.labelCol)
+ self.model = self.estimator.fit(df._jdf)
+ else:
+ numColsy = getNumCols(y)
+ if numColsy != 1:
+ raise Exception('Expected y to be a column vector')
+ self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y))
+ if self.setOutputRawPredictionsToFalse:
+ self.model.setOutputRawPredictions(False)
+ return self
+ else:
+ raise Exception('Unsupported input type')
+
+ def transform(self, X):
+ return self.predict(X)
+
+ # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
+ def predict(self, X):
+ """
+ Invokes the transform method on the Model object on the JVM if X is one of the supported data types
+
+ Parameters
+ ----------
+ X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
+ """
+ if isinstance(X, SUPPORTED_TYPES):
+ if self.transferUsingDF:
+ pdfX = convertToPandasDF(X)
+ df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
+ retjDF = self.model.transform(df._jdf)
+ retDF = DataFrame(retjDF, self.sqlCtx)
+ retPDF = retDF.sort('ID').select('prediction').toPandas()
+ if isinstance(X, np.ndarray):
+ return retPDF.as_matrix().flatten()
+ else:
+ return retPDF
+ else:
+ retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
+ if isinstance(X, np.ndarray):
+ return retNumPy
+ else:
+ return retNumPy # TODO: Convert to Pandas
+ elif hasattr(X, '_jdf'):
+ if self.featuresCol in X.columns:
+ # No need to assemble as input DF is likely coming via MLPipeline
+ df = X
+ else:
+ assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
+ df = assembler.transform(X)
+ retjDF = self.model.transform(df._jdf)
+ retDF = DataFrame(retjDF, self.sqlCtx)
+ # Return DF
+ return retDF.sort('ID')
+ else:
+ raise Exception('Unsupported input type')
+
+class BaseSystemMLClassifier(BaseSystemMLEstimator):
+
+ def score(self, X, y):
+ """
+ Scores the predicted values against the ground truth 'y'
+
+ Parameters
+ ----------
+ X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ """
+ return sk.metrics.accuracy_score(y, self.predict(X))
+
+class BaseSystemMLRegressor(BaseSystemMLEstimator):
+
+ def score(self, X, y):
+ """
+ Scores the predicted values against the ground truth 'y'
+
+ Parameters
+ ----------
+ X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
+ """
+ return sk.metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')
+
+
+class LogisticRegression(BaseSystemMLClassifier):
+ def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+ """
+ Performs both binomial and multinomial logistic regression.
+
+ Parameters
+ ----------
+ sqlCtx: PySpark SQLContext
+ penalty: Only 'l2' supported
+ fit_intercept: Specifies whether to add intercept or not (default: True)
+ max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100)
+ max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0)
+ tol: Tolerance used in the convergence criterion (default: 0.000001)
+ C: 1/regularization parameter (default: 1.0)
+ solver: Only 'newton-cg' solver supported
+ """
+ self.sqlCtx = sqlCtx
+ self.sc = sqlCtx._sc
+ self.uid = "logReg"
+ self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc())
+ self.estimator.setMaxOuterIter(max_iter)
+ self.estimator.setMaxInnerIter(max_inner_iter)
+ if C <= 0:
+ raise Exception('C has to be positive')
+ reg = 1.0 / C
+ self.estimator.setRegParam(reg)
+ self.estimator.setTol(tol)
+ self.estimator.setIcpt(int(fit_intercept))
+ self.transferUsingDF = transferUsingDF
+ self.setOutputRawPredictionsToFalse = True
+ if penalty != 'l2':
+ raise Exception('Only l2 penalty is supported')
+ if solver != 'newton-cg':
+ raise Exception('Only newton-cg solver supported')
+
+class LinearRegression(BaseSystemMLRegressor):
+
+ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
+ """
+ Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
+
+ Parameters
+ ----------
+ sqlCtx: PySpark SQLContext
+ fit_intercept: Specifies whether to add intercept or not (default: True)
+ max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
+ tol: Tolerance used in the convergence criterion (default: 0.000001)
+ C: 1/regularization parameter (default: 1.0)
+ solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
+ Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
+ 'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
+ input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
+ """
+ self.sqlCtx = sqlCtx
+ self.sc = sqlCtx._sc
+ self.uid = "lr"
+ if solver == 'newton-cg' or solver == 'direct-solve':
+ self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver)
+ else:
+ raise Exception("Only 'newton-cg' and 'direct-solve' solvers are supported")
+ self.estimator.setMaxIter(max_iter)
+ if C <= 0:
+ raise Exception('C has to be positive')
+ reg = 1.0 / C
+ self.estimator.setRegParam(reg)
+ self.estimator.setTol(tol)
+ self.estimator.setIcpt(int(fit_intercept))
+ self.transferUsingDF = transferUsingDF
+ self.setOutputRawPredictionsToFalse = False
+
+
+class SVM(BaseSystemMLClassifier):
+
+ def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
+ """
+ Performs both binary-class and multiclass SVM (Support Vector Machines).
+
+ Parameters
+ ----------
+ sqlCtx: PySpark SQLContext
+ fit_intercept: Specifies whether to add intercept or not (default: True)
+ max_iter: Maximum number of iterations (default: 100)
+ tol: Tolerance used in the convergence criterion (default: 0.000001)
+ C: 1/regularization parameter (default: 1.0)
+ is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False)
+ """
+ self.sqlCtx = sqlCtx
+ self.sc = sqlCtx._sc
+ self.uid = "svm"
+ self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class)
+ self.estimator.setMaxIter(max_iter)
+ if C <= 0:
+ raise Exception('C has to be positive')
+ reg = 1.0 / C
+ self.estimator.setRegParam(reg)
+ self.estimator.setTol(tol)
+ self.estimator.setIcpt(int(fit_intercept))
+ self.transferUsingDF = transferUsingDF
+ self.setOutputRawPredictionsToFalse = False
+
+class NaiveBayes(BaseSystemMLClassifier):
+
+ def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
+ """
+ Performs Naive Bayes classification.
+
+ Parameters
+ ----------
+ sqlCtx: PySpark SQLContext
+ laplace: Laplace smoothing specified by the user to avoid creation of 0 probabilities (default: 1.0)
+ """
+ self.sqlCtx = sqlCtx
+ self.sc = sqlCtx._sc
+ self.uid = "nb"
+ self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc())
+ self.estimator.setLaplace(laplace)
+ self.transferUsingDF = transferUsingDF
+ self.setOutputRawPredictionsToFalse = False
+
+__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']
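
A minimal end-to-end sketch of these estimators, assuming a live SparkContext `sc` (it mirrors the scikit-learn-style examples in the documentation updates elsewhere in this commit):

    import numpy as np
    from sklearn import datasets
    from pyspark.sql import SQLContext
    from systemml.mllearn import LogisticRegression

    sqlCtx = SQLContext(sc)
    digits = datasets.load_digits()
    X, y = digits.data, digits.target
    n = int(len(X) * 0.9)  # simple 90/10 train/test split
    logistic = LogisticRegression(sqlCtx)
    print('LogisticRegression score: %f'
          % logistic.fit(X[:n], y[:n]).score(X[n:], y[n:]))
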
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/tests/test_mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mlcontext.py b/src/main/python/tests/test_mlcontext.py
index 182a4d8..6a6f64e 100644
--- a/src/main/python/tests/test_mlcontext.py
+++ b/src/main/python/tests/test_mlcontext.py
@@ -23,7 +23,7 @@ import unittest
from pyspark.context import SparkContext
-from SystemML import MLContext, dml, pydml
+from systemml import MLContext, dml, pydml
sc = SparkContext()
ml = MLContext(sc)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/tests/test_mllearn.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mllearn.py b/src/main/python/tests/test_mllearn.py
index 22f798f..27b9813 100644
--- a/src/main/python/tests/test_mllearn.py
+++ b/src/main/python/tests/test_mllearn.py
@@ -20,7 +20,7 @@
#
#-------------------------------------------------------------
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression, LinearRegression, SVM, NaiveBayes
+from systemml.mllearn import LogisticRegression, LinearRegression, SVM, NaiveBayes
from pyspark.sql import SQLContext
from pyspark.context import SparkContext
import unittest
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/uploadToPyPI.sh
----------------------------------------------------------------------
diff --git a/src/main/python/uploadToPyPI.sh b/src/main/python/uploadToPyPI.sh
deleted file mode 100644
index c892f3d..0000000
--- a/src/main/python/uploadToPyPI.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-cd ../../..
-mvn clean package -P distribution
-tar -xzf target/systemml-*-SNAPSHOT.tar.gz -C src/main/python/SystemML
-
-cd src/main/python/SystemML
-mv systemml-*-incubating-SNAPSHOT SystemML-java
-
-cd ..
-echo "Preparing to upload to PyPI ...."
-python setup.py register sdist upload
-
-rm -r SystemML/SystemML-java
\ No newline at end of file
[2/2] incubator-systemml git commit: [SYSTEMML-878] Update the Python package from SystemML to systemml
Posted by ni...@apache.org.
[SYSTEMML-878] Update the Python package from SystemML to systemml
- Updated Python package name from SystemML to systemml
- Moved uploadToPyPI.sh script to dev/release
- Updated the documentation
Closes #231.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/542de374
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/542de374
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/542de374
Branch: refs/heads/master
Commit: 542de374e19e25e3581de45af06d37a2f4cdaad0
Parents: 7610a21
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu Sep 1 16:55:57 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu Sep 1 16:55:57 2016 -0700
----------------------------------------------------------------------
dev/release/uploadToPyPI.sh | 34 ++
docs/algorithms-classification.md | 18 +-
docs/algorithms-regression.md | 8 +-
docs/beginners-guide-python.md | 49 ++-
src/main/python/MANIFEST.in | 18 +-
src/main/python/SystemML/__init__.py | 29 --
src/main/python/SystemML/converters.py | 100 -----
src/main/python/SystemML/defmatrix.py | 410 --------------------
src/main/python/SystemML/mlcontext.py | 302 --------------
src/main/python/SystemML/mllearn/__init__.py | 25 --
src/main/python/SystemML/mllearn/estimators.py | 302 --------------
src/main/python/setup.py | 4 +-
src/main/python/systemml/__init__.py | 29 ++
src/main/python/systemml/converters.py | 100 +++++
src/main/python/systemml/defmatrix.py | 410 ++++++++++++++++++++
src/main/python/systemml/mlcontext.py | 302 ++++++++++++++
src/main/python/systemml/mllearn/__init__.py | 25 ++
src/main/python/systemml/mllearn/estimators.py | 302 ++++++++++++++
src/main/python/tests/test_mlcontext.py | 2 +-
src/main/python/tests/test_mllearn.py | 2 +-
src/main/python/uploadToPyPI.sh | 34 --
21 files changed, 1265 insertions(+), 1240 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/dev/release/uploadToPyPI.sh
----------------------------------------------------------------------
diff --git a/dev/release/uploadToPyPI.sh b/dev/release/uploadToPyPI.sh
new file mode 100644
index 0000000..44a6b4f
--- /dev/null
+++ b/dev/release/uploadToPyPI.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+cd ../..
+mvn clean package -P distribution
+tar -xzf target/systemml-*-SNAPSHOT.tar.gz -C src/main/python/systemml
+
+cd src/main/python/systemml
+mv systemml-*-incubating-SNAPSHOT systemml-java
+
+cd ..
+echo "Preparing to upload to PyPI ...."
+python setup.py register sdist upload -r https://pypi.python.org/pypi
+
+rm -r systemml/systemml-java
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-classification.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md
index 340267c..8d19d04 100644
--- a/docs/algorithms-classification.md
+++ b/docs/algorithms-classification.md
@@ -129,7 +129,7 @@ Eqs. (1) and (2).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
# C = 1/reg
logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -237,7 +237,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -253,7 +253,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -498,7 +498,7 @@ support vector machine (`y` with domain size `2`).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
# C = 1/reg
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -766,7 +766,7 @@ class labels.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
# C = 1/reg
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -916,7 +916,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -932,7 +932,7 @@ print('LogisticRegression score: %f' % svm.fit(X_train, y_train).score(X_test, y
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import SVM
+from systemml.mllearn import SVM
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -1122,7 +1122,7 @@ applicable when all features are counts of categorical values.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
nb = NaiveBayes(sqlCtx, laplace=1.0)
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
y_test = nb.fit(X_train, y_train)
@@ -1257,7 +1257,7 @@ SystemML Language Reference for details.
{% highlight python %}
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
-from SystemML.mllearn import NaiveBayes
+from systemml.mllearn import NaiveBayes
from sklearn import metrics
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-regression.md
----------------------------------------------------------------------
diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md
index 6585b00..992862e 100644
--- a/docs/algorithms-regression.md
+++ b/docs/algorithms-regression.md
@@ -82,7 +82,7 @@ efficient when the number of features $m$ is relatively small
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
# C = 1/reg
lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix
@@ -124,7 +124,7 @@ y_test = lr.fit(df_train)
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
# C = 1/reg
lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
# X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices
@@ -222,7 +222,7 @@ SystemML Language Reference for details.
{% highlight python %}
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -277,7 +277,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
{% highlight python %}
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index 3b4aeed..f040212 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -72,7 +72,7 @@ brew install apache-spark
#### Step 1: Install SystemML Python package
```bash
-pip install SystemML
+pip install systemml
```
#### Step 2: Download SystemML Java binaries
@@ -81,14 +81,14 @@ SystemML Python package downloads the corresponding Java binaries (along with al
into the installed location. To find the location of the downloaded Java binaries, use the following command:
```bash
-python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'
+python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'
```
#### Step 3: (Optional but recommended) Set SYSTEMML_HOME environment variable
<div class="codetabs">
<div data-lang="OSX" markdown="1">
```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
# If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
echo '' >> ~/.bashrc
echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -96,7 +96,7 @@ echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
</div>
<div data-lang="Linux" markdown="1">
```bash
-SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'`
+SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'`
# If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively.
echo '' >> ~/.bashrc
echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc
@@ -128,7 +128,7 @@ pyspark --master local[*] --driver-class-path $SYSTEMML_HOME"/SystemML.jar"
To get started with SystemML, let's try a few elementary matrix multiplication operations:
```python
-import SystemML as sml
+import systemml as sml
import numpy as np
sml.setSparkContext(sc)
m1 = sml.matrix(np.ones((3,3)) + 2)
@@ -152,7 +152,7 @@ model: $ \beta = solve(X^T X, X^T y) $. For simplicity, we will use direct-solve
```python
import numpy as np
from sklearn import datasets
-import SystemML as sml
+import systemml as sml
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -196,7 +196,7 @@ algorithm.
```python
import numpy as np
from sklearn import datasets
-from SystemML.mllearn import LinearRegression
+from systemml.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
@@ -230,7 +230,7 @@ algorithm on digits datasets.
```python
# Scikit-learn way
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
@@ -245,15 +245,21 @@ logistic = LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
```
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
### Passing PySpark DataFrame
To train the above algorithm on a larger dataset, we can load the dataset into a DataFrame and pass it to the `fit` method:
```python
from sklearn import datasets, neighbors
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.sql import SQLContext
-import SystemML as sml
+import systemml as sml
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
X_digits = digits.data
@@ -267,6 +273,12 @@ logistic = LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(df_train).score(X_test, y_test))
```
+Output:
+
+```bash
+LogisticRegression score: 0.922222
+```
+
### MLPipeline interface
In the below example, we demonstrate how the same `LogisticRegression` class can allow SystemML to fit seamlessly into
@@ -275,7 +287,7 @@ large data pipelines.
```python
# MLPipeline way
from pyspark.ml import Pipeline
-from SystemML.mllearn import LogisticRegression
+from systemml.mllearn import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
@@ -307,6 +319,19 @@ prediction = model.transform(test)
prediction.show()
```
+Output:
+
+```bash
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|id| text| words| features| probability| ID|prediction|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+|12| spark i j k|ArrayBuffer(spark...|(20,[5,6,7],[2.0,...|[0.99999999999975...|1.0| 1.0|
+|13| l m n|ArrayBuffer(l, m, n)|(20,[8,9,10],[1.0...|[1.37552128844736...|2.0| 2.0|
+|14|mapreduce spark|ArrayBuffer(mapre...|(20,[5,10],[1.0,1...|[0.99860290938153...|3.0| 1.0|
+|15| apache hadoop|ArrayBuffer(apach...|(20,[9,14],[1.0,1...|[5.41688748236143...|4.0| 2.0|
++--+---------------+--------------------+--------------------+--------------------+---+----------+
+```
+
## Invoking DML/PyDML scripts using MLContext
The below example demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml)
@@ -315,7 +340,7 @@ using Python [MLContext API](https://apache.github.io/incubator-systemml/spark-m
```python
from sklearn import datasets, neighbors
from pyspark.sql import DataFrame, SQLContext
-import SystemML as sml
+import systemml as sml
import pandas as pd
import os
sqlCtx = SQLContext(sc)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/MANIFEST.in
----------------------------------------------------------------------
diff --git a/src/main/python/MANIFEST.in b/src/main/python/MANIFEST.in
index a185263..a0835d2 100644
--- a/src/main/python/MANIFEST.in
+++ b/src/main/python/MANIFEST.in
@@ -18,12 +18,12 @@
# under the License.
#
#-------------------------------------------------------------
-include SystemML/SystemML-java/LICENSE
-include SystemML/SystemML-java/SystemML-config.xml
-include SystemML/SystemML-java/NOTICE
-include SystemML/SystemML-java/SystemML.jar
-include SystemML/SystemML-java/DISCLAIMER
-include SystemML/SystemML-java/scripts/sparkDML.sh
-recursive-include SystemML/SystemML-java/scripts/algorithms *
-recursive-include SystemML/SystemML-java/scripts/datagen *
-recursive-include SystemML/SystemML-java/scripts/utils *
\ No newline at end of file
+include systemml/systemml-java/LICENSE
+include systemml/systemml-java/SystemML-config.xml
+include systemml/systemml-java/NOTICE
+include systemml/systemml-java/SystemML.jar
+include systemml/systemml-java/DISCLAIMER
+include systemml/systemml-java/scripts/sparkDML.sh
+recursive-include systemml/systemml-java/scripts/algorithms *
+recursive-include systemml/systemml-java/scripts/datagen *
+recursive-include systemml/systemml-java/scripts/utils *
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/__init__.py b/src/main/python/SystemML/__init__.py
deleted file mode 100644
index 02a940b..0000000
--- a/src/main/python/SystemML/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from .mlcontext import *
-from .defmatrix import *
-from .converters import *
-
-__all__ = mlcontext.__all__
-__all__ += defmatrix.__all__
-__all__ += converters.__all__
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/converters.py b/src/main/python/SystemML/converters.py
deleted file mode 100644
index 9588bec..0000000
--- a/src/main/python/SystemML/converters.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from pyspark.context import SparkContext
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
-import numpy as np
-import pandas as pd
-import sklearn as sk
-
-from scipy.sparse import spmatrix
-from scipy.sparse import coo_matrix
-
-SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
-
-def getNumCols(numPyArr):
- if numPyArr.ndim == 1:
- return 1
- else:
- return numPyArr.shape[1]
-
-def convertToLabeledDF(sqlCtx, X, y=None):
- from pyspark.ml.feature import VectorAssembler
- if y is not None:
- pd1 = pd.DataFrame(X)
- pd2 = pd.DataFrame(y, columns=['label'])
- pdf = pd.concat([pd1, pd2], axis=1)
- inputColumns = ['C' + str(i) for i in pd1.columns]
- outputColumns = inputColumns + ['label']
- else:
- pdf = pd.DataFrame(X)
- inputColumns = ['C' + str(i) for i in pdf.columns]
- outputColumns = inputColumns
- assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
- out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns))
- if y is not None:
- return out.select('features', 'label')
- else:
- return out.select('features')
-
-
-def convertToMatrixBlock(sc, src):
- if isinstance(src, spmatrix):
- src = coo_matrix(src, dtype=np.float64)
- numRows = src.shape[0]
- numCols = src.shape[1]
- data = src.data
- row = src.row.astype(np.int32)
- col = src.col.astype(np.int32)
- nnz = len(src.col)
- buf1 = bytearray(data.tostring())
- buf2 = bytearray(row.tostring())
- buf3 = bytearray(col.tostring())
- return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz)
- elif isinstance(sc, SparkContext):
- src = np.asarray(src)
- numCols = getNumCols(src)
- numRows = src.shape[0]
- arr = src.ravel().astype(np.float64)
- buf = bytearray(arr.tostring())
- return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
- else:
- raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-
-
-def convertToNumpyArr(sc, mb):
- if isinstance(sc, SparkContext):
- numRows = mb.getNumRows()
- numCols = mb.getNumColumns()
- buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb)
- return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols))
- else:
- raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-
-
-def convertToPandasDF(X):
- if not isinstance(X, pd.DataFrame):
- return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
- return X
-
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
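
These converters are re-added unchanged under the new systemml package name (see the file list in part [2/2]), so a round-trip still looks like the following sketch, assuming a live SparkContext `sc`:

    import numpy as np
    from systemml import convertToMatrixBlock, convertToNumpyArr

    X = np.arange(6, dtype=np.float64).reshape((3, 2))
    mb = convertToMatrixBlock(sc, X)  # NumPy -> JVM MatrixBlock (dense path)
    X2 = convertToNumpyArr(sc, mb)    # JVM MatrixBlock -> NumPy
    assert np.allclose(X, X2)
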
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/defmatrix.py b/src/main/python/SystemML/defmatrix.py
deleted file mode 100644
index 18f6314..0000000
--- a/src/main/python/SystemML/defmatrix.py
+++ /dev/null
@@ -1,410 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-import numpy as np
-
-from . import pydml, MLContext, MLResults
-from .converters import *
-from pyspark import SparkContext, RDD
-from pyspark.sql import DataFrame, SQLContext
-
-def setSparkContext(sc):
- """
- Before using the matrix, the user needs to invoke this function.
-
- Parameters
- ----------
- sc: SparkContext
- SparkContext
- """
- matrix.ml = MLContext(sc)
- matrix.sc = sc
-
-def checkIfMLContextIsSet():
- if matrix.ml is None:
- raise Exception('Expected setSparkContext(sc) to be called.')
-
-class DMLOp(object):
- """
- Represents an intermediate node of the abstract syntax tree created to generate the PyDML script.
- """
- def __init__(self, inputs, dml=None):
- self.inputs = inputs
- self.dml = dml
-
- def _visit(self, execute=True):
- matrix.dml = matrix.dml + self.dml
-
-
-def reset():
- """
- Resets the visited status of matrix and the operators in the generated AST.
- """
- for m in matrix.visited:
- m.visited = False
- matrix.visited = []
-
-def binaryOp(lhs, rhs, opStr):
- """
- Common function called by all the binary operators in matrix class
- """
- inputs = []
- if isinstance(lhs, matrix):
- lhsStr = lhs.ID
- inputs = [lhs]
- elif isinstance(lhs, float) or isinstance(lhs, int):
- lhsStr = str(lhs)
- else:
- raise TypeError('Incorrect type')
- if isinstance(rhs, matrix):
- rhsStr = rhs.ID
- inputs = inputs + [rhs]
- elif isinstance(rhs, float) or isinstance(rhs, int):
- rhsStr = str(rhs)
- else:
- raise TypeError('Incorrect type')
- dmlOp = DMLOp(inputs)
- out = matrix(None, op=dmlOp)
- dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
- return out
-
-def binaryMatrixFunction(X, Y, fnName):
- """
- Common function called by supported PyDML built-in functions that take two matrix arguments.
- TODO: This needs to be generalized to support arbitrary arguments of different types.
- """
- if not isinstance(X, matrix) or not isinstance(Y, matrix):
- raise TypeError('Incorrect input type. Expected matrix type')
- inputs = [X, Y]
- dmlOp = DMLOp(inputs)
- out = matrix(None, op=dmlOp)
- dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
- return out
-
-def solve(A, b):
- """
- Computes the least squares solution for system of linear equations A %*% x = b
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn import datasets
- >>> import SystemML as sml
- >>> from pyspark.sql import SQLContext
- >>> diabetes = datasets.load_diabetes()
- >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
- >>> X_train = diabetes_X[:-20]
- >>> X_test = diabetes_X[-20:]
- >>> y_train = diabetes.target[:-20]
- >>> y_test = diabetes.target[-20:]
- >>> sml.setSparkContext(sc)
- >>> X = sml.matrix(X_train)
- >>> y = sml.matrix(y_train)
- >>> A = X.transpose().dot(X)
- >>> b = X.transpose().dot(y)
- >>> beta = sml.solve(A, b).toNumPyArray()
- >>> y_predicted = X_test.dot(beta)
- >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
- Residual sum of squares: 25282.12
- """
- return binaryMatrixFunction(A, b, 'solve')
-
-def eval(outputs, outputDF=False, execute=True):
- """
- Executes the unevaluated DML script and computes the matrices specified by outputs.
-
- Parameters
- ----------
- outputs: list of matrices
- outputDF: if True, return the data of each output matrix as a PySpark DataFrame
- """
- checkIfMLContextIsSet()
- reset()
- matrix.dml = []
- matrix.script = pydml('')
- if isinstance(outputs, matrix):
- outputs = [ outputs ]
- elif not isinstance(outputs, list):
- raise TypeError('Incorrect input type')
- for m in outputs:
- m.output = True
- m._visit(execute=execute)
- if not execute:
- return ''.join(matrix.dml)
- matrix.script.scriptString = ''.join(matrix.dml)
- results = matrix.ml.execute(matrix.script)
- # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
- for m in outputs:
- if outputDF:
- m.data = results.getDataFrame(m.ID)
- else:
- m.data = results.getNumPyArray(m.ID)
-
-class matrix(object):
- """
- matrix class is a Python wrapper that implements basic matrix operators.
- Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
-
- Examples
- --------
- >>> import SystemML as sml
- >>> import numpy as np
- >>> sml.setSparkContext(sc)
-
- Welcome to Apache SystemML!
-
- >>> m1 = sml.matrix(np.ones((3,3)) + 2)
- >>> m2 = sml.matrix(np.ones((3,3)) + 3)
- >>> m2 = m1 * (m2 + m1)
- >>> m4 = 1.0 - m2
- >>> m4
- # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
- mVar1 = load(" ", format="csv")
- mVar2 = load(" ", format="csv")
- mVar3 = mVar2 + mVar1
- mVar4 = mVar1 * mVar3
- mVar5 = 1.0 - mVar4
- save(mVar5, " ")
-
- <SystemML.defmatrix.matrix object>
- >>> m2.eval()
- >>> m2
- # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
- <SystemML.defmatrix.matrix object>
- >>> m4
- # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
- mVar4 = load(" ", format="csv")
- mVar5 = 1.0 - mVar4
- save(mVar5, " ")
-
- <SystemML.defmatrix.matrix object>
- >>> m4.sum(axis=1).toNumPyArray()
- array([[-60.],
- [-60.],
- [-60.]])
- """
- # Global variable that is used to keep track of intermediate matrix variables in the DML script
- systemmlVarID = 0
-
- # Since repeated string concatenation is expensive, we collect the strings into a list and join
- # them once before execution: see matrix.script.scriptString = ''.join(matrix.dml) in the eval() method
- dml = []
-
- # Represents MLContext's script object
- script = None
-
- # Represents MLContext object
- ml = None
-
- # Contains the list of nodes visited in the abstract syntax tree. This helps avoid recomputation of
- # matrix objects that have already been evaluated.
- visited = []
-
- def __init__(self, data, op=None):
- """
- Constructs a lazy matrix
-
- Parameters
- ----------
- data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation).
- """
- checkIfMLContextIsSet()
- self.visited = False
- matrix.systemmlVarID += 1
- self.output = False
- self.ID = 'mVar' + str(matrix.systemmlVarID)
- if isinstance(data, SUPPORTED_TYPES):
- self.data = data
- elif hasattr(data, '_jdf'):
- self.data = data
- elif data is None and op is not None:
- self.data = None
- # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
- self.op = op
- else:
- raise TypeError('Unsupported input type')
-
- def eval(self, outputDF=False):
- """
- This is a convenience function that calls the global eval method
- """
- eval([self], outputDF=outputDF)
-
- def toPandas(self):
- """
- This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame.
- """
- if self.data is None:
- self.eval()
- return convertToPandasDF(self.data)
-
- def toNumPyArray(self):
- """
- This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array.
- """
- if self.data is None:
- self.eval()
- if isinstance(self.data, DataFrame):
- self.data = self.data.toPandas().as_matrix()
- # Always keep default format as NumPy array if possible
- return self.data
-
- def toDataFrame(self):
- """
- This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame.
- """
- if self.data is None:
- self.eval(outputDF=True)
- if not isinstance(self.data, DataFrame):
- if MLResults.sqlContext is None:
- MLResults.sqlContext = SQLContext(matrix.sc)
- self.data = MLResults.sqlContext.createDataFrame(self.toPandas())
- return self.data
-
- def _visit(self, execute=True):
- """
- This function is called for two scenarios:
- 1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method.
- 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself
- and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext.
- """
- if self.visited:
- return self
- self.visited = True
- # for cleanup
- matrix.visited = matrix.visited + [ self ]
- if self.data is not None:
- matrix.dml = matrix.dml + [ self.ID, ' = load(\" \", format=\"csv\")\n']
- if isinstance(self.data, DataFrame) and execute:
- matrix.script.input(self.ID, self.data)
- elif execute:
- matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
- return self
- elif self.op is not None:
- for m in self.op.inputs:
- m._visit(execute=execute)
- self.op._visit(execute=execute)
- else:
- raise Exception('Expected either op or data to be set')
- if self.data is None and self.output:
- matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n']
- if execute:
- matrix.script.output(self.ID)
- return self
-
- def __repr__(self):
- """
- This function helps to debug matrix class and also examine the generated PyDML script
- """
- if self.data is None:
- print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False))
- elif isinstance(self.data, DataFrame):
- print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
- else:
- print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
- return '<SystemML.defmatrix.matrix object>'
-
- def __add__(self, other):
- return binaryOp(self, other, ' + ')
-
- def __sub__(self, other):
- return binaryOp(self, other, ' - ')
-
- def __mul__(self, other):
- return binaryOp(self, other, ' * ')
-
- def __floordiv__(self, other):
- return binaryOp(self, other, ' // ')
-
- def __div__(self, other):
- return binaryOp(self, other, ' / ')
-
- def __mod__(self, other):
- return binaryOp(self, other, ' % ')
-
- def __pow__(self, other):
- return binaryOp(self, other, ' ** ')
-
- def __radd__(self, other):
- return binaryOp(other, self, ' + ')
-
- def __rsub__(self, other):
- return binaryOp(other, self, ' - ')
-
- def __rmul__(self, other):
- return binaryOp(other, self, ' * ')
-
- def __rfloordiv__(self, other):
- return binaryOp(other, self, ' // ')
-
- def __rdiv__(self, other):
- return binaryOp(other, self, ' / ')
-
- def __rmod__(self, other):
- return binaryOp(other, self, ' % ')
-
- def __rpow__(self, other):
- return binaryOp(other, self, ' ** ')
-
- def sum(self, axis=None):
- return self._aggFn('sum', axis)
-
- def mean(self, axis=None):
- return self._aggFn('mean', axis)
-
- def max(self, axis=None):
- return self._aggFn('max', axis)
-
- def min(self, axis=None):
- return self._aggFn('min', axis)
-
- def argmin(self, axis=None):
- return self._aggFn('argmin', axis)
-
- def argmax(self, axis=None):
- return self._aggFn('argmax', axis)
-
- def cumsum(self, axis=None):
- return self._aggFn('cumsum', axis)
-
- def transpose(self, axis=None):
- return self._aggFn('transpose', axis)
-
- def trace(self, axis=None):
- return self._aggFn('trace', axis)
-
- def _aggFn(self, fnName, axis):
- """
- Common function that is called for functions that have axis as parameter.
- """
- dmlOp = DMLOp([self])
- out = matrix(None, op=dmlOp)
- if axis is None:
- dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n']
- else:
- dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
- return out
-
- def dot(self, other):
- return binaryMatrixFunction(self, other, 'dot')
-
-__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mlcontext.py b/src/main/python/SystemML/mlcontext.py
deleted file mode 100644
index 1b90e70..0000000
--- a/src/main/python/SystemML/mlcontext.py
+++ /dev/null
@@ -1,302 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-import os
-
-try:
- import py4j.java_gateway
- from py4j.java_gateway import JavaObject
-except ImportError:
- raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark')
-
-from pyspark import SparkContext
-import pyspark.mllib.common
-from pyspark.sql import DataFrame, SQLContext
-from .converters import *
-
-def dml(scriptString):
- """
- Create a dml script object based on a string.
-
- Parameters
- ----------
- scriptString: string
- Can be a path to a dml script or a dml script itself.
-
- Returns
- -------
- script: Script instance
- Instance of a script object.
- """
- if not isinstance(scriptString, str):
- raise ValueError("scriptString should be a string, got %s" % type(scriptString))
- return Script(scriptString, scriptType="dml")
-
-
-def pydml(scriptString):
- """
- Create a pydml script object based on a string.
-
- Parameters
- ----------
- scriptString: string
- Can be a path to a pydml script or a pydml script itself.
-
- Returns
- -------
- script: Script instance
- Instance of a script object.
- """
- if not isinstance(scriptString, str):
- raise ValueError("scriptString should be a string, got %s" % type(scriptString))
- return Script(scriptString, scriptType="pydml")
-
-
-def _java2py(sc, obj):
- """ Convert Java object to Python. """
- # TODO: Port this private PySpark function.
- obj = pyspark.mllib.common._java2py(sc, obj)
- if isinstance(obj, JavaObject):
- class_name = obj.getClass().getSimpleName()
- if class_name == 'Matrix':
- obj = Matrix(obj, sc)
- return obj
-
-
-def _py2java(sc, obj):
- """ Convert Python object to Java. """
- if isinstance(obj, Matrix):
- obj = obj._java_matrix
- # TODO: Port this private PySpark function.
- obj = pyspark.mllib.common._py2java(sc, obj)
- return obj
-
-
-class Matrix(object):
- """
- Wrapper around a Java Matrix object.
-
- Parameters
- ----------
- javaMatrix: JavaObject
- A Java Matrix object as returned by calling `ml.execute().get()`.
-
- sc: SparkContext
- SparkContext
- """
- def __init__(self, javaMatrix, sc):
- self._java_matrix = javaMatrix
- self.sc = sc
-
- def __repr__(self):
- return "Matrix"
-
- def toDF(self):
- """
- Convert the Matrix to a PySpark SQL DataFrame.
-
- Returns
- -------
- df: PySpark SQL DataFrame
- A PySpark SQL DataFrame representing the matrix, with
- one "ID" column containing the row index (since Spark
- DataFrames are unordered), followed by columns of doubles
- for each column in the matrix.
- """
- jdf = self._java_matrix.asDataFrame()
- df = _java2py(self.sc, jdf)
- return df
-
-
-class MLResults(object):
- """
- Wrapper around a Java ML Results object.
-
- Parameters
- ----------
- results: JavaObject
- A Java MLResults object as returned by calling `ml.execute()`.
-
- sc: SparkContext
- SparkContext
- """
- def __init__(self, results, sc):
- self._java_results = results
- self.sc = sc
- try:
- if MLResults.sqlContext is None:
- MLResults.sqlContext = SQLContext(sc)
- except AttributeError:
- MLResults.sqlContext = SQLContext(sc)
-
- def __repr__(self):
- return "MLResults"
-
- def getNumPyArray(self, *outputs):
- """
- Parameters
- ----------
- outputs: string, list of strings
- Output variables as defined inside the DML script.
- """
- outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).asBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
- if len(outs) == 1:
- return outs[0]
- return outs
-
- def getDataFrame(self, *outputs):
- """
- Parameters
- ----------
- outputs: string, list of strings
- Output variables as defined inside the DML script.
- """
- outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
- if len(outs) == 1:
- return outs[0]
- return outs
-
- def get(self, *outputs):
- """
- Parameters
- ----------
- outputs: string, list of strings
- Output variables as defined inside the DML script.
- """
- outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
- if len(outs) == 1:
- return outs[0]
- return outs
-
-
-class Script(object):
- """
- Instance of a DML/PyDML Script.
-
- Parameters
- ----------
- scriptString: string
- Can be either a file path to a DML script or a DML script itself.
-
- scriptType: string
- Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like).
- """
- def __init__(self, scriptString, scriptType="dml"):
- self.scriptString = scriptString
- self.scriptType = scriptType
- self._input = {}
- self._output = []
-
- def input(self, *args, **kwargs):
- """
- Parameters
- ----------
- args: name, value tuple
- where name is a string, and currently supported value formats
- are double, string, dataframe, rdd, or a list of such objects.
-
- kwargs: dict of name, value pairs
- To know what formats are supported for name and value, look above.
- """
- if args and len(args) != 2:
- raise ValueError("Expected name, value pair.")
- elif args:
- self._input[args[0]] = args[1]
- for name, value in kwargs.items():
- self._input[name] = value
- return self
-
- def output(self, *names):
- """
- Parameters
- ----------
- names: string, list of strings
- Output variables as defined inside the DML script.
- """
- self._output.extend(names)
- return self
-
-
-class MLContext(object):
- """
- Wrapper around the new SystemML MLContext.
-
- Parameters
- ----------
- sc: SparkContext
- SparkContext
- """
- def __init__(self, sc):
- if not isinstance(sc, SparkContext):
- raise ValueError("Expected sc to be a SparkContext, got " % sc)
- self._sc = sc
- self._ml = sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc)
-
- def __repr__(self):
- return "MLContext"
-
- def execute(self, script):
- """
- Execute a DML / PyDML script.
-
- Parameters
- ----------
- script: Script instance
- Script instance defined with the appropriate input and output variables.
-
- Returns
- -------
- ml_results: MLResults
- MLResults instance.
- """
- if not isinstance(script, Script):
- raise ValueError("Expected script to be an instance of Script")
- scriptString = script.scriptString
- if script.scriptType == "dml":
- if scriptString.endswith(".dml"):
- if os.path.exists(scriptString):
- script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString)
- else:
- raise ValueError("path: %s does not exist" % scriptString)
- else:
- script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString)
- elif script.scriptType == "pydml":
- if scriptString.endswith(".pydml"):
- if os.path.exists(scriptString):
- script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString)
- else:
- raise ValueError("path: %s does not exist" % scriptString)
- else:
- script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString)
-
- for key, val in script._input.items():
- # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not
- # allowed. Therefore, we use the following code in which we retrieve a function
- # representing `script_java.in`, and then call it with the arguments. This is in
- # lieu of adding a new `input` method on the JVM side, as that would complicate use
- # from Scala/Java.
- py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val))
- for val in script._output:
- script_java.out(val)
- return MLResults(self._ml.execute(script_java), self._sc)
-
-
-__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mllearn/__init__.py b/src/main/python/SystemML/mllearn/__init__.py
deleted file mode 100644
index 69cab58..0000000
--- a/src/main/python/SystemML/mllearn/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from .estimators import *
-
-__all__ = estimators.__all__
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML/mllearn/estimators.py b/src/main/python/SystemML/mllearn/estimators.py
deleted file mode 100644
index 5d33d64..0000000
--- a/src/main/python/SystemML/mllearn/estimators.py
+++ /dev/null
@@ -1,302 +0,0 @@
-#!/usr/bin/python
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-from pyspark.context import SparkContext
-from pyspark.sql import DataFrame, SQLContext
-from pyspark.rdd import RDD
-import numpy as np
-import pandas as pd
-import sklearn as sk
-from pyspark.ml.feature import VectorAssembler
-from pyspark.mllib.linalg import Vectors
-from pyspark.ml import Estimator, Model
-
-from ..converters import *
-
-def assemble(sqlCtx, pdf, inputCols, outputCol):
- tmpDF = sqlCtx.createDataFrame(pdf, list(pdf.columns))
- assembler = VectorAssembler(inputCols=list(inputCols), outputCol=outputCol)
- return assembler.transform(tmpDF)
-
-class BaseSystemMLEstimator(Estimator):
- featuresCol = 'features'
- labelCol = 'label'
-
- def setFeaturesCol(self, colName):
- """
- Sets the default column name for features of PySpark DataFrame.
-
- Parameters
- ----------
- colName: column name for features (default: 'features')
- """
- self.featuresCol = colName
-
- def setLabelCol(self, colName):
- """
- Sets the default column name for the label of the PySpark DataFrame.
-
- Parameters
- ----------
- colName: column name for label (default: 'label')
- """
- self.labelCol = colName
-
- # Returns a model after calling fit(df) on Estimator object on JVM
- def _fit(self, X):
- """
- Invokes the fit method on the Estimator object on the JVM if X is a PySpark DataFrame
-
- Parameters
- ----------
- X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label')
- """
- if hasattr(X, '_jdf') and self.featuresCol in X.columns and self.labelCol in X.columns:
- self.model = self.estimator.fit(X._jdf)
- return self
- else:
- raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns')
-
- def fit(self, X, y=None, params=None):
- """
- Invokes the fit method on the Estimator object on the JVM if X and y are one of the supported data types
-
- Parameters
- ----------
- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- """
- if y is None:
- return self._fit(X)
- elif y is not None and isinstance(X, SUPPORTED_TYPES) and isinstance(y, SUPPORTED_TYPES):
- if self.transferUsingDF:
- pdfX = convertToPandasDF(X)
- pdfY = convertToPandasDF(y)
- if getNumCols(pdfY) != 1:
- raise Exception('y should be a column vector')
- if pdfX.shape[0] != pdfY.shape[0]:
- raise Exception('Number of rows of X and y should match')
- colNames = pdfX.columns
- pdfX[self.labelCol] = pdfY[pdfY.columns[0]]
- df = assemble(self.sqlCtx, pdfX, colNames, self.featuresCol).select(self.featuresCol, self.labelCol)
- self.model = self.estimator.fit(df._jdf)
- else:
- numColsy = getNumCols(y)
- if numColsy != 1:
- raise Exception('Expected y to be a column vector')
- self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y))
- if self.setOutputRawPredictionsToFalse:
- self.model.setOutputRawPredictions(False)
- return self
- else:
- raise Exception('Unsupported input type')
-
- def transform(self, X):
- return self.predict(X)
-
- # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM
- def predict(self, X):
- """
- Invokes the transform method on the Model object on the JVM if X is one of the supported data types
-
- Parameters
- ----------
- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
- """
- if isinstance(X, SUPPORTED_TYPES):
- if self.transferUsingDF:
- pdfX = convertToPandasDF(X)
- df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
- retjDF = self.model.transform(df._jdf)
- retDF = DataFrame(retjDF, self.sqlCtx)
- retPDF = retDF.sort('ID').select('prediction').toPandas()
- if isinstance(X, np.ndarray):
- return retPDF.as_matrix().flatten()
- else:
- return retPDF
- else:
- retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
- if isinstance(X, np.ndarray):
- return retNumPy
- else:
- return retNumPy # TODO: Convert to Pandas
- elif hasattr(X, '_jdf'):
- if self.featuresCol in X.columns:
- # No need to assemble as input DF is likely coming via MLPipeline
- df = X
- else:
- assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
- df = assembler.transform(X)
- retjDF = self.model.transform(df._jdf)
- retDF = DataFrame(retjDF, self.sqlCtx)
- # Return DF
- return retDF.sort('ID')
- else:
- raise Exception('Unsupported input type')
-
-class BaseSystemMLClassifier(BaseSystemMLEstimator):
-
- def score(self, X, y):
- """
- Scores the predicted value with ground truth 'y'
-
- Parameters
- ----------
- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- """
- return sk.metrics.accuracy_score(y, self.predict(X))
-
-class BaseSystemMLRegressor(BaseSystemMLEstimator):
-
- def score(self, X, y):
- """
- Scores the predicted value with ground truth 'y'
-
- Parameters
- ----------
- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix
- """
- return sk.metrics.r2_score(y, self.predict(X), multioutput='variance_weighted')
-
-
-class LogisticRegression(BaseSystemMLClassifier):
- def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
- """
- Performs both binomial and multinomial logistic regression.
-
- Parameters
- ----------
- sqlCtx: PySpark SQLContext
- penalty: Only 'l2' supported
- fit_intercept: Specifies whether to add intercept or not (default: True)
- max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100)
- max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0)
- tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
- solver: Only 'newton-cg' solver supported
- """
- self.sqlCtx = sqlCtx
- self.sc = sqlCtx._sc
- self.uid = "logReg"
- self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc())
- self.estimator.setMaxOuterIter(max_iter)
- self.estimator.setMaxInnerIter(max_inner_iter)
- if C <= 0:
- raise Exception('C has to be positive')
- reg = 1.0 / C
- self.estimator.setRegParam(reg)
- self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
- self.transferUsingDF = transferUsingDF
- self.setOutputRawPredictionsToFalse = True
- if penalty != 'l2':
- raise Exception('Only l2 penalty is supported')
- if solver != 'newton-cg':
- raise Exception('Only newton-cg solver supported')
-
-class LinearRegression(BaseSystemMLRegressor):
-
- def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False):
- """
- Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.
-
- Parameters
- ----------
- sqlCtx: PySpark SQLContext
- fit_intercept: Specifies whether to add intercept or not (default: True)
- max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100)
- tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
- solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg').
- Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient.
- 'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and
- input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient.
- """
- self.sqlCtx = sqlCtx
- self.sc = sqlCtx._sc
- self.uid = "lr"
- if solver == 'newton-cg' or solver == 'direct-solve':
- self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver)
- else:
- raise Exception("Only 'newton-cg' and 'direct-solve' solvers are supported")
- self.estimator.setMaxIter(max_iter)
- if C <= 0:
- raise Exception('C has to be positive')
- reg = 1.0 / C
- self.estimator.setRegParam(reg)
- self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
- self.transferUsingDF = transferUsingDF
- self.setOutputRawPredictionsToFalse = False
-
-
-class SVM(BaseSystemMLClassifier):
-
- def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False):
- """
- Performs both binary-class and multiclass SVM (Support Vector Machines).
-
- Parameters
- ----------
- sqlCtx: PySpark SQLContext
- fit_intercept: Specifies whether to add intercept or not (default: True)
- max_iter: Maximum number iterations (default: 100)
- tol: Tolerance used in the convergence criterion (default: 0.000001)
- C: 1/regularization parameter (default: 1.0)
- is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False)
- """
- self.sqlCtx = sqlCtx
- self.sc = sqlCtx._sc
- self.uid = "svm"
- self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class)
- self.estimator.setMaxIter(max_iter)
- if C <= 0:
- raise Exception('C has to be positive')
- reg = 1.0 / C
- self.estimator.setRegParam(reg)
- self.estimator.setTol(tol)
- self.estimator.setIcpt(int(fit_intercept))
- self.transferUsingDF = transferUsingDF
- self.setOutputRawPredictionsToFalse = False
-
-class NaiveBayes(BaseSystemMLClassifier):
-
- def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False):
- """
- Performs Naive Bayes classification.
-
- Parameters
- ----------
- sqlCtx: PySpark SQLContext
- laplace: Laplace smoothing specified by the user to avoid creation of 0 probabilities (default: 1.0)
- """
- self.sqlCtx = sqlCtx
- self.sc = sqlCtx._sc
- self.uid = "nb"
- self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc())
- self.estimator.setLaplace(laplace)
- self.transferUsingDF = transferUsingDF
- self.setOutputRawPredictionsToFalse = False
-
-__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/setup.py
----------------------------------------------------------------------
diff --git a/src/main/python/setup.py b/src/main/python/setup.py
index 0bcebab..cc8f373 100644
--- a/src/main/python/setup.py
+++ b/src/main/python/setup.py
@@ -34,7 +34,7 @@ REQUIRED_PACKAGES = [
]
PACKAGE_DATA = []
-for path, subdirs, files in os.walk('SystemML/SystemML-java'):
+for path, subdirs, files in os.walk('systemml/systemml-java'):
for name in files:
PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ]
@@ -61,7 +61,7 @@ setup(
install_requires=REQUIRED_PACKAGES,
include_package_data=True,
package_data={
- 'SystemML-java': PACKAGE_DATA
+ 'systemml-java': PACKAGE_DATA
},
classifiers=[
'Intended Audience :: Developers',
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py
new file mode 100644
index 0000000..02a940b
--- /dev/null
+++ b/src/main/python/systemml/__init__.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from .mlcontext import *
+from .defmatrix import *
+from .converters import *
+
+__all__ = mlcontext.__all__
+__all__ += defmatrix.__all__
+__all__ += converters.__all__
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
new file mode 100644
index 0000000..9588bec
--- /dev/null
+++ b/src/main/python/systemml/converters.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from pyspark.context import SparkContext
+from pyspark.sql import DataFrame, SQLContext
+from pyspark.rdd import RDD
+import numpy as np
+import pandas as pd
+import sklearn as sk
+
+from scipy.sparse import spmatrix
+from scipy.sparse import coo_matrix
+
+SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix)
+
+def getNumCols(numPyArr):
+ if numPyArr.ndim == 1:
+ return 1
+ else:
+ return numPyArr.shape[1]
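+
+# For example: getNumCols(np.zeros(5)) is 1, while getNumCols(np.zeros((5, 3))) is 3.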
+
+def convertToLabeledDF(sqlCtx, X, y=None):
+ from pyspark.ml.feature import VectorAssembler
+ if y is not None:
+ pd1 = pd.DataFrame(X)
+ pd2 = pd.DataFrame(y, columns=['label'])
+ pdf = pd.concat([pd1, pd2], axis=1)
+ inputColumns = ['C' + str(i) for i in pd1.columns]
+ outputColumns = inputColumns + ['label']
+ else:
+ pdf = pd.DataFrame(X)
+ inputColumns = ['C' + str(i) for i in pdf.columns]
+ outputColumns = inputColumns
+ assembler = VectorAssembler(inputCols=inputColumns, outputCol='features')
+ out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns))
+ if y is not None:
+ return out.select('features', 'label')
+ else:
+ return out.select('features')
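+
+# A usage sketch for convertToLabeledDF (hypothetical data; assumes an active
+# SQLContext named sqlCtx). The column names follow the 'C<i>' convention used above:
+#
+# >>> X = np.array([[1.0, 2.0], [3.0, 4.0]])
+# >>> y = np.array([0.0, 1.0])
+# >>> convertToLabeledDF(sqlCtx, X, y).columns
+# ['features', 'label']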
+
+
+def convertToMatrixBlock(sc, src):
+ if not isinstance(sc, SparkContext):
+ raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
+ if isinstance(src, spmatrix):
+ # Sparse input: ship the COO representation (values, row indices, column indices)
+ src = coo_matrix(src, dtype=np.float64)
+ numRows = src.shape[0]
+ numCols = src.shape[1]
+ data = src.data
+ row = src.row.astype(np.int32)
+ col = src.col.astype(np.int32)
+ nnz = len(src.col)
+ buf1 = bytearray(data.tostring())
+ buf2 = bytearray(row.tostring())
+ buf3 = bytearray(col.tostring())
+ return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz)
+ else:
+ # Dense input: ship the row-major float64 buffer
+ src = np.asarray(src)
+ numCols = getNumCols(src)
+ numRows = src.shape[0]
+ arr = src.ravel().astype(np.float64)
+ buf = bytearray(arr.tostring())
+ return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols)
+
+
+def convertToNumpyArr(sc, mb):
+ if isinstance(sc, SparkContext):
+ numRows = mb.getNumRows()
+ numCols = mb.getNumColumns()
+ buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb)
+ return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols))
+ else:
+ raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
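+
+# A round-trip sketch for the two converters above (assumes a live SparkContext
+# sc with the SystemML jar on the classpath):
+#
+# >>> m = np.array([[1.0, 2.0], [3.0, 4.0]])
+# >>> mb = convertToMatrixBlock(sc, m)
+# >>> convertToNumpyArr(sc, mb)
+# array([[ 1.,  2.],
+#        [ 3.,  4.]])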
+
+
+def convertToPandasDF(X):
+ if not isinstance(X, pd.DataFrame):
+ return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))])
+ return X
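+
+# Note: for a non-DataFrame input the generated column names follow the same
+# 'C<i>' convention, e.g. list(convertToPandasDF(np.ones((2, 2))).columns) == ['C0', 'C1'].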
+
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
new file mode 100644
index 0000000..18f6314
--- /dev/null
+++ b/src/main/python/systemml/defmatrix.py
@@ -0,0 +1,410 @@
+#!/usr/bin/python
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import numpy as np
+
+from . import pydml, MLContext, MLResults
+from .converters import *
+from pyspark import SparkContext, RDD
+from pyspark.sql import DataFrame, SQLContext
+
+def setSparkContext(sc):
+ """
+ Before using the matrix, the user needs to invoke this function.
+
+ Parameters
+ ----------
+ sc: SparkContext
+ SparkContext
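+
+ Examples
+ --------
+ A minimal sketch (assumes a running SparkContext sc, e.g. inside pyspark):
+
+ >>> import systemml as sml
+ >>> sml.setSparkContext(sc)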
+ """
+ matrix.ml = MLContext(sc)
+ matrix.sc = sc
+
+def checkIfMLContextIsSet():
+ if matrix.ml is None:
+ raise Exception('Expected setSparkContext(sc) to be called before using the matrix class.')
+
+class DMLOp(object):
+ """
+ Represents an intermediate node of the abstract syntax tree (AST) that is built to generate the PyDML script
+ """
+ def __init__(self, inputs, dml=None):
+ self.inputs = inputs
+ self.dml = dml
+
+ def _visit(self, execute=True):
+ matrix.dml = matrix.dml + self.dml
+
+
+def reset():
+ """
+ Resets the visited status of matrix and the operators in the generated AST.
+ """
+ for m in matrix.visited:
+ m.visited = False
+ matrix.visited = []
+
+def binaryOp(lhs, rhs, opStr):
+ """
+ Common function called by all the binary operators in matrix class
+ """
+ inputs = []
+ if isinstance(lhs, matrix):
+ lhsStr = lhs.ID
+ inputs = [lhs]
+ elif isinstance(lhs, float) or isinstance(lhs, int):
+ lhsStr = str(lhs)
+ else:
+ raise TypeError('Incorrect type')
+ if isinstance(rhs, matrix):
+ rhsStr = rhs.ID
+ inputs = inputs + [rhs]
+ elif isinstance(rhs, float) or isinstance(rhs, int):
+ rhsStr = str(rhs)
+ else:
+ raise TypeError('Incorrect type')
+ dmlOp = DMLOp(inputs)
+ out = matrix(None, op=dmlOp)
+ dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
+ return out
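+
+# For illustration: with a matrix m1, the expression `m1 + 2` routes through
+# binaryOp(m1, 2, ' + ') and, at eval time, contributes a PyDML line of the
+# shape below (variable IDs are assigned sequentially, so exact names differ):
+#
+# mVar2 = mVar1 + 2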
+
+def binaryMatrixFunction(X, Y, fnName):
+ """
+ Common function called by supported PyDML built-in functions that take two arguments, both of which are matrices.
+ TODO: This needs to be generalized to support an arbitrary number of arguments of different types.
+ """
+ if not isinstance(X, matrix) or not isinstance(Y, matrix):
+ raise TypeError('Incorrect input type. Expected matrix type')
+ inputs = [X, Y]
+ dmlOp = DMLOp(inputs)
+ out = matrix(None, op=dmlOp)
+ dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
+ return out
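+
+# Similarly, m1.dot(m2) routes through binaryMatrixFunction(m1, m2, 'dot') and
+# contributes a PyDML line of the shape: mVar3 = dot(mVar1, mVar2)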
+
+def solve(A, b):
+ """
+ Computes the least squares solution for the system of linear equations A %*% x = b
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn import datasets
+ >>> import systemml as sml
+ >>> from pyspark.sql import SQLContext
+ >>> diabetes = datasets.load_diabetes()
+ >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
+ >>> X_train = diabetes_X[:-20]
+ >>> X_test = diabetes_X[-20:]
+ >>> y_train = diabetes.target[:-20]
+ >>> y_test = diabetes.target[-20:]
+ >>> sml.setSparkContext(sc)
+ >>> X = sml.matrix(X_train)
+ >>> y = sml.matrix(y_train)
+ >>> A = X.transpose().dot(X)
+ >>> b = X.transpose().dot(y)
+ >>> beta = sml.solve(A, b).toNumPyArray()
+ >>> y_predicted = X_test.dot(beta)
+ >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
+ Residual sum of squares: 25282.12
+ """
+ return binaryMatrixFunction(A, b, 'solve')
+
+def eval(outputs, outputDF=False, execute=True):
+ """
+ Executes the unevaluated DML script and computes the matrices specified by outputs.
+
+ Parameters
+ ----------
+ outputs: a matrix object or a list of matrix objects to be computed
+ outputDF: if True, the evaluated data of each output matrix is stored as a PySpark DataFrame; otherwise as a NumPy array (default: False)
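+
+ Examples
+ --------
+ A minimal sketch (assumes `import systemml as sml` and that sml.setSparkContext(sc) has been called):
+
+ >>> m1 = sml.matrix(np.ones((3,3)))
+ >>> m2 = m1 + 1
+ >>> sml.eval(m2)
+ >>> m2.data # the evaluated result; a NumPy array by default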
+ """
+ checkIfMLContextIsSet()
+ reset()
+ matrix.dml = []
+ matrix.script = pydml('')
+ if isinstance(outputs, matrix):
+ outputs = [ outputs ]
+ elif not isinstance(outputs, list):
+ raise TypeError('Incorrect input type')
+ for m in outputs:
+ m.output = True
+ m._visit(execute=execute)
+ if not execute:
+ return ''.join(matrix.dml)
+ matrix.script.scriptString = ''.join(matrix.dml)
+ results = matrix.ml.execute(matrix.script)
+ # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
+ for m in outputs:
+ if outputDF:
+ m.data = results.getDataFrame(m.ID)
+ else:
+ m.data = results.getNumPyArray(m.ID)
+
+class matrix(object):
+ """
+ matrix class is a Python wrapper that implements basic matrix operators.
+ Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
+
+ Examples
+ --------
+ >>> import systemml as sml
+ >>> import numpy as np
+ >>> sml.setSparkContext(sc)
+
+ Welcome to Apache SystemML!
+
+ >>> m1 = sml.matrix(np.ones((3,3)) + 2)
+ >>> m2 = sml.matrix(np.ones((3,3)) + 3)
+ >>> m2 = m1 * (m2 + m1)
+ >>> m4 = 1.0 - m2
+ >>> m4
+ # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
+ mVar1 = load(" ", format="csv")
+ mVar2 = load(" ", format="csv")
+ mVar3 = mVar2 + mVar1
+ mVar4 = mVar1 * mVar3
+ mVar5 = 1.0 - mVar4
+ save(mVar5, " ")
+
+ <systemml.defmatrix.matrix object>
+ >>> m2.eval()
+ >>> m2
+ # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
+ <systemml.defmatrix.matrix object>
+ >>> m4
+ # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
+ mVar4 = load(" ", format="csv")
+ mVar5 = 1.0 - mVar4
+ save(mVar5, " ")
+
+ <systemml.defmatrix.matrix object>
+ >>> m4.sum(axis=1).toNumPyArray()
+ array([[-60.],
+ [-60.],
+ [-60.]])
+ """
+ # Global variable that is used to keep track of intermediate matrix variables in the DML script
+ systemmlVarID = 0
+
+ # Since string concatenation is an expensive operation, we collect the strings into a list and join
+ # them just before execution: see matrix.script.scriptString = ''.join(matrix.dml) in the eval() method
+ dml = []
+
+ # Represents MLContext's script object
+ script = None
+
+ # Represents MLContext object
+ ml = None
+
+ # Contains the list of nodes visited in the abstract syntax tree. This helps to avoid recomputing matrix objects
+ # that have already been evaluated.
+ visited = []
+
+ def __init__(self, data, op=None):
+ """
+ Constructs a lazy matrix
+
+ Parameters
+ ----------
+ data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation).
+ """
+ checkIfMLContextIsSet()
+ self.visited = False
+ matrix.systemmlVarID += 1
+ self.output = False
+ self.ID = 'mVar' + str(matrix.systemmlVarID)
+ if isinstance(data, SUPPORTED_TYPES):
+ self.data = data
+ elif hasattr(data, '_jdf'):
+ self.data = data
+ elif data is None and op is not None:
+ self.data = None
+ # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
+ self.op = op
+ else:
+ raise TypeError('Unsupported input type')
+
+ def eval(self, outputDF=False):
+ """
+ This is a convenience function that calls the global eval method
+ """
+ eval([self], outputDF=outputDF)
+
+ def toPandas(self):
+ """
+ This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame.
+ """
+ if self.data is None:
+ self.eval()
+ return convertToPandasDF(self.data)
+
+ def toNumPyArray(self):
+ """
+ This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array.
+ """
+ if self.data is None:
+ self.eval()
+ if isinstance(self.data, DataFrame):
+ self.data = self.data.toPandas().as_matrix()
+ # Always keep default format as NumPy array if possible
+ return self.data
+
+ def toDataFrame(self):
+ """
+ This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame.
+ """
+ if self.data is None:
+ self.eval(outputDF=True)
+ if not isinstance(self.data, DataFrame):
+ # MLResults.sqlContext may not have been set yet if no script has been executed
+ if getattr(MLResults, 'sqlContext', None) is None:
+ MLResults.sqlContext = SQLContext(matrix.sc)
+ self.data = MLResults.sqlContext.createDataFrame(self.toPandas())
+ return self.data
+
+ def _visit(self, execute=True):
+ """
+ This function is called for two scenarios:
+ 1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method.
+ 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself
+ and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext.
+ """
+ if self.visited:
+ return self
+ self.visited = True
+ # for cleanup
+ matrix.visited = matrix.visited + [ self ]
+ if self.data is not None:
+ matrix.dml = matrix.dml + [ self.ID, ' = load(\" \", format=\"csv\")\n']
+ if isinstance(self.data, DataFrame) and execute:
+ matrix.script.input(self.ID, self.data)
+ elif execute:
+ matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
+ return self
+ elif self.op is not None:
+ for m in self.op.inputs:
+ m._visit(execute=execute)
+ self.op._visit(execute=execute)
+ else:
+ raise Exception('Expected either op or data to be set')
+ if self.data is None and self.output:
+ matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n']
+ if execute:
+ matrix.script.output(self.ID)
+ return self
+
+ def __repr__(self):
+ """
+ This function helps to debug matrix class and also examine the generated PyDML script
+ """
+ if self.data is None:
+ print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False))
+ elif isinstance(self.data, DataFrame):
+ print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
+ else:
+ print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
+ return '<systemml.defmatrix.matrix object>'
+
+ def __add__(self, other):
+ return binaryOp(self, other, ' + ')
+
+ def __sub__(self, other):
+ return binaryOp(self, other, ' - ')
+
+ def __mul__(self, other):
+ return binaryOp(self, other, ' * ')
+
+ def __floordiv__(self, other):
+ return binaryOp(self, other, ' // ')
+
+ def __div__(self, other):
+ return binaryOp(self, other, ' / ')
+
+ # Python 3 maps the / operator to __truediv__ rather than __div__
+ __truediv__ = __div__
+
+ def __mod__(self, other):
+ return binaryOp(self, other, ' % ')
+
+ def __pow__(self, other):
+ return binaryOp(self, other, ' ** ')
+
+ def __radd__(self, other):
+ return binaryOp(other, self, ' + ')
+
+ def __rsub__(self, other):
+ return binaryOp(other, self, ' - ')
+
+ def __rmul__(self, other):
+ return binaryOp(other, self, ' * ')
+
+ def __rfloordiv__(self, other):
+ return binaryOp(other, self, ' // ')
+
+ def __rdiv__(self, other):
+ return binaryOp(other, self, ' / ')
+
+ __rtruediv__ = __rdiv__
+
+ def __rmod__(self, other):
+ return binaryOp(other, self, ' % ')
+
+ def __rpow__(self, other):
+ return binaryOp(other, self, ' ** ')
+
+ def sum(self, axis=None):
+ return self._aggFn('sum', axis)
+
+ def mean(self, axis=None):
+ return self._aggFn('mean', axis)
+
+ def max(self, axis=None):
+ return self._aggFn('max', axis)
+
+ def min(self, axis=None):
+ return self._aggFn('min', axis)
+
+ def argmin(self, axis=None):
+ return self._aggFn('argmin', axis)
+
+ def argmax(self, axis=None):
+ return self._aggFn('argmax', axis)
+
+ def cumsum(self, axis=None):
+ return self._aggFn('cumsum', axis)
+
+ def transpose(self, axis=None):
+ return self._aggFn('transpose', axis)
+
+ def trace(self, axis=None):
+ return self._aggFn('trace', axis)
+
+ def _aggFn(self, fnName, axis):
+ """
+ Common function that is called for functions that have axis as parameter.
+ """
+ dmlOp = DMLOp([self])
+ out = matrix(None, op=dmlOp)
+ if axis is None:
+ dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n']
+ else:
+ dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
+ return out
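+
+ # For illustration: m.sum(axis=1) goes through _aggFn('sum', 1) and emits a
+ # PyDML line of the shape: mVarN = sum(mVarM, axis=1)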
+
+ def dot(self, other):
+ return binaryMatrixFunction(self, other, 'dot')
+
+__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']