You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/09/14 18:34:05 UTC
incubator-systemml git commit: [SYSTEMML-879] Remove the Python
MLResults.getDataFrame & MLResults.getNumPyArray functions.
Repository: incubator-systemml
Updated Branches:
refs/heads/master f4b50cdb1 -> bcf431331
[SYSTEMML-879] Remove the Python MLResults.getDataFrame & MLResults.getNumPyArray functions.
* Remove the `MLResults.getDataFrame` function and replace usages of `getDataFrame("matrixName")` with `get("matrixName").toDF()`.
* Remove the `MLResults.getNumPyArray` function and replace usages of `getNumPyArray("matrixName")` with `get("matrixName").toNumPy()`.
Closes #239.
Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bcf43133
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bcf43133
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bcf43133
Branch: refs/heads/master
Commit: bcf431331b38e7204437cc217487810d6fd06aac
Parents: f4b50cd
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Sep 14 11:27:57 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Sep 14 11:27:57 2016 -0700
----------------------------------------------------------------------
docs/beginners-guide-python.md | 2 +-
src/main/python/systemml/converters.py | 4 +-
src/main/python/systemml/defmatrix.py | 13 +++---
src/main/python/systemml/mlcontext.py | 44 ++++++---------------
src/main/python/systemml/mllearn/estimators.py | 2 +-
src/main/python/tests/test_mlcontext.py | 19 ++++++---
6 files changed, 37 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index f040212..725363a 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -354,5 +354,5 @@ y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:.9 * n_samples]))
ml = sml.MLContext(sc)
script = os.path.join(os.environ['SYSTEMML_HOME'], 'scripts', 'algorithms', 'MultiLogReg.dml')
script = sml.dml(script).input(X=X_df, Y_vec=y_df).output("B_out")
-beta = ml.execute(script).getNumPyArray('B_out')
+beta = ml.execute(script).get('B_out').toNumPy()
```
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
index 243a507..044fcfa 100644
--- a/src/main/python/systemml/converters.py
+++ b/src/main/python/systemml/converters.py
@@ -19,7 +19,7 @@
#
#-------------------------------------------------------------
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumPyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
import numpy as np
import pandas as pd
@@ -79,7 +79,7 @@ def convertToMatrixBlock(sc, src):
raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
-def convertToNumpyArr(sc, mb):
+def convertToNumPyArr(sc, mb):
if isinstance(sc, SparkContext):
numRows = mb.getNumRows()
numCols = mb.getNumColumns()
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 4d2781a..50abc0a 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -39,6 +39,7 @@ def setSparkContext(sc):
SparkContext
"""
matrix.sc = sc
+ matrix.sqlContext = SQLContext(sc)
matrix.ml = MLContext(matrix.sc)
def checkIfMLContextIsSet():
@@ -205,9 +206,9 @@ def populateOutputs(outputs, results, outputDF):
"""
for m in outputs:
if outputDF:
- m.data = results.getDataFrame(m.ID)
+ m.data = results.get(m.ID).toDF()
else:
- m.data = results.getNumPyArray(m.ID)
+ m.data = results.get(m.ID).toNumPy()
###############################################################################
@@ -279,7 +280,7 @@ def solve(A, b):
>>> y = sml.matrix(y_train)
>>> A = X.transpose().dot(X)
>>> b = X.transpose().dot(y)
- >>> beta = sml.solve(A, b).toNumPyArray()
+ >>> beta = sml.solve(A, b).toNumPy()
>>> y_predicted = X_test.dot(beta)
>>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
Residual sum of squares: 25282.12
@@ -378,7 +379,7 @@ class matrix(object):
save(mVar5, " ")
<SystemML.defmatrix.matrix object>
- >>> m4.sum(axis=1).toNumPyArray()
+ >>> m4.sum(axis=1).toNumPy()
array([[-60.],
[-60.],
[-60.]])
@@ -452,9 +453,7 @@ class matrix(object):
if self.data is None:
self.eval(outputDF=True)
if not isinstance(self.data, DataFrame):
- if MLResults.sqlContext is None:
- MLResults.sqlContext = SQLContext(matrix.sc)
- self.data = sqlContext.createDataFrame(self.toPandas())
+ self.data = matrix.sqlContext.createDataFrame(self.toPandas())
return self.data
def _markAsVisited(self):
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
index ae56a46..b6cb799 100644
--- a/src/main/python/systemml/mlcontext.py
+++ b/src/main/python/systemml/mlcontext.py
@@ -31,7 +31,6 @@ except ImportError:
from pyspark import SparkContext
import pyspark.mllib.common
-from pyspark.sql import DataFrame, SQLContext
from .converters import *
@@ -118,7 +117,7 @@ class Matrix(object):
Returns
-------
- df: PySpark SQL DataFrame
+ PySpark SQL DataFrame
A PySpark SQL DataFrame representing the matrix, with
one "__INDEX" column containing the row index (since Spark
DataFrames are unordered), followed by columns of doubles
@@ -128,6 +127,18 @@ class Matrix(object):
df = _java2py(self.sc, jdf)
return df
+ def toNumPy(self):
+ """
+ Convert the Matrix to a NumPy Array.
+
+ Returns
+ -------
+ NumPy Array
+ A NumPy Array representing the Matrix object.
+ """
+ np_array = convertToNumPyArr(self.sc, self._java_matrix.toBinaryBlockMatrix().getMatrixBlock())
+ return np_array
+
class MLResults(object):
"""
@@ -144,39 +155,10 @@ class MLResults(object):
def __init__(self, results, sc):
self._java_results = results
self.sc = sc
- try:
- if MLResults.sqlContext is None:
- MLResults.sqlContext = SQLContext(sc)
- except AttributeError:
- MLResults.sqlContext = SQLContext(sc)
def __repr__(self):
return "MLResults"
- def getNumPyArray(self, *outputs):
- """
- Parameters
- ----------
- outputs: string, list of strings
- Output variables as defined inside the DML script.
- """
- outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).toBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
- if len(outs) == 1:
- return outs[0]
- return outs
-
- def getDataFrame(self, *outputs):
- """
- Parameters
- ----------
- outputs: string, list of strings
- Output variables as defined inside the DML script.
- """
- outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
- if len(outs) == 1:
- return outs[0]
- return outs
-
def get(self, *outputs):
"""
Parameters
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index ceead4d..82e0b2c 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -131,7 +131,7 @@ class BaseSystemMLEstimator(Estimator):
else:
return retPDF
else:
- retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
+ retNumPy = convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
if isinstance(X, np.ndarray):
return retNumPy
else:
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/tests/test_mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mlcontext.py b/src/main/python/tests/test_mlcontext.py
index b9ecb00..353771f 100644
--- a/src/main/python/tests/test_mlcontext.py
+++ b/src/main/python/tests/test_mlcontext.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
@@ -23,6 +22,8 @@ import unittest
from pyspark.context import SparkContext
+import numpy as np
+
from systemml import MLContext, dml, pydml
sc = SparkContext()
@@ -64,6 +65,15 @@ class TestAPI(unittest.TestCase):
m2 = ml.execute(script).get("m2")
self.assertEqual(repr(m2.toDF()), "DataFrame[__INDEX: double, C1: double, C2: double]")
+ def test_matrix_toNumPy(self):
+ script = """
+ m2 = m1 * 2
+ """
+ rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
+ script = dml(script).input(m1=rdd1).output("m2")
+ m2 = ml.execute(script).get("m2")
+ self.assertTrue((m2.toNumPy() == np.array([[2.0, 4.0], [6.0, 8.0]])).all())
+
def test_input_single(self):
script = """
x2 = x1 + 1
@@ -88,15 +98,14 @@ class TestAPI(unittest.TestCase):
rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
rdd2 = sc.parallelize(["5.0,6.0", "7.0,8.0"])
script = dml(sums).input(m1=rdd1).input(m2=rdd2).output("s1", "s2", "s3")
- self.assertEqual(
- ml.execute(script).get("s1", "s2", "s3"), [10.0, 26.0, "whatever"])
+ self.assertEqual(ml.execute(script).get("s1", "s2", "s3"), [10.0, 26.0, "whatever"])
def test_pydml(self):
script = "A = full('1 2 3 4 5 6 7 8 9', rows=3, cols=3)\nx = toString(A)"
script = pydml(script).output("x")
self.assertEqual(
- ml.execute(script).get("x"),
- '1.000 2.000 3.000\n4.000 5.000 6.000\n7.000 8.000 9.000\n'
+ ml.execute(script).get("x"),
+ '1.000 2.000 3.000\n4.000 5.000 6.000\n7.000 8.000 9.000\n'
)