You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/09/14 18:34:05 UTC

incubator-systemml git commit: [SYSTEMML-879] Remove the Python MLResults.getDataFrame & MLResults.getNumPyArray functions.

Repository: incubator-systemml
Updated Branches:
  refs/heads/master f4b50cdb1 -> bcf431331


[SYSTEMML-879] Remove the Python MLResults.getDataFrame & MLResults.getNumPyArray functions.

* Remove the `MLResults.getDataFrame` function and replace usages of `getDataFrame("matrixName")` with `get("matrixName").toDF()`.
* Remove the `MLResults.getNumPyArray` function and replace usages of `getNumPyArray("matrixName")` with `get("matrixName").toNumPy()`.

Closes #239.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bcf43133
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bcf43133
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bcf43133

Branch: refs/heads/master
Commit: bcf431331b38e7204437cc217487810d6fd06aac
Parents: f4b50cd
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Sep 14 11:27:57 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Sep 14 11:27:57 2016 -0700

----------------------------------------------------------------------
 docs/beginners-guide-python.md                 |  2 +-
 src/main/python/systemml/converters.py         |  4 +-
 src/main/python/systemml/defmatrix.py          | 13 +++---
 src/main/python/systemml/mlcontext.py          | 44 ++++++---------------
 src/main/python/systemml/mllearn/estimators.py |  2 +-
 src/main/python/tests/test_mlcontext.py        | 19 ++++++---
 6 files changed, 37 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/docs/beginners-guide-python.md
----------------------------------------------------------------------
diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md
index f040212..725363a 100644
--- a/docs/beginners-guide-python.md
+++ b/docs/beginners-guide-python.md
@@ -354,5 +354,5 @@ y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:.9 * n_samples]))
 ml = sml.MLContext(sc)
 script = os.path.join(os.environ['SYSTEMML_HOME'], 'scripts', 'algorithms', 'MultiLogReg.dml')
 script = sml.dml(script).input(X=X_df, Y_vec=y_df).output("B_out")
-beta = ml.execute(script).getNumPyArray('B_out')
+beta = ml.execute(script).get('B_out').toNumPy()
 ```

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/converters.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py
index 243a507..044fcfa 100644
--- a/src/main/python/systemml/converters.py
+++ b/src/main/python/systemml/converters.py
@@ -19,7 +19,7 @@
 #
 #-------------------------------------------------------------
 
-__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
+__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumPyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF']
 
 import numpy as np
 import pandas as pd
@@ -79,7 +79,7 @@ def convertToMatrixBlock(sc, src):
         raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves
 
 
-def convertToNumpyArr(sc, mb):
+def convertToNumPyArr(sc, mb):
     if isinstance(sc, SparkContext):
         numRows = mb.getNumRows()
         numCols = mb.getNumColumns()

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 4d2781a..50abc0a 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -39,6 +39,7 @@ def setSparkContext(sc):
         SparkContext
     """
     matrix.sc = sc
+    matrix.sqlContext = SQLContext(sc)
     matrix.ml = MLContext(matrix.sc)
 
 def checkIfMLContextIsSet():
@@ -205,9 +206,9 @@ def populateOutputs(outputs, results, outputDF):
     """
     for m in outputs:
         if outputDF:
-            m.data = results.getDataFrame(m.ID)
+            m.data = results.get(m.ID).toDF()
         else:
-            m.data = results.getNumPyArray(m.ID)
+            m.data = results.get(m.ID).toNumPy()
 
 ###############################################################################
 
@@ -279,7 +280,7 @@ def solve(A, b):
     >>> y = sml.matrix(y_train)
     >>> A = X.transpose().dot(X)
     >>> b = X.transpose().dot(y)
-    >>> beta = sml.solve(A, b).toNumPyArray()
+    >>> beta = sml.solve(A, b).toNumPy()
     >>> y_predicted = X_test.dot(beta)
     >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
     Residual sum of squares: 25282.12
@@ -378,7 +379,7 @@ class matrix(object):
     save(mVar5, " ")
 
     <SystemML.defmatrix.matrix object>
-    >>> m4.sum(axis=1).toNumPyArray()
+    >>> m4.sum(axis=1).toNumPy()
     array([[-60.],
            [-60.],
            [-60.]])
@@ -452,9 +453,7 @@ class matrix(object):
         if self.data is None:
             self.eval(outputDF=True)
         if not isinstance(self.data, DataFrame):
-            if MLResults.sqlContext is None:
-                MLResults.sqlContext = SQLContext(matrix.sc)
-            self.data = sqlContext.createDataFrame(self.toPandas())
+            self.data = matrix.sqlContext.createDataFrame(self.toPandas())
         return self.data
 
     def _markAsVisited(self):

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py
index ae56a46..b6cb799 100644
--- a/src/main/python/systemml/mlcontext.py
+++ b/src/main/python/systemml/mlcontext.py
@@ -31,7 +31,6 @@ except ImportError:
 
 from pyspark import SparkContext
 import pyspark.mllib.common
-from pyspark.sql import DataFrame, SQLContext
 
 from .converters import *
 
@@ -118,7 +117,7 @@ class Matrix(object):
 
         Returns
         -------
-        df: PySpark SQL DataFrame
+        PySpark SQL DataFrame
             A PySpark SQL DataFrame representing the matrix, with
             one "__INDEX" column containing the row index (since Spark
             DataFrames are unordered), followed by columns of doubles
@@ -128,6 +127,18 @@ class Matrix(object):
         df = _java2py(self.sc, jdf)
         return df
 
+    def toNumPy(self):
+        """
+        Convert the Matrix to a NumPy Array.
+
+        Returns
+        -------
+        NumPy Array
+            A NumPy Array representing the Matrix object.
+        """
+        np_array = convertToNumPyArr(self.sc, self._java_matrix.toBinaryBlockMatrix().getMatrixBlock())
+        return np_array
+
 
 class MLResults(object):
     """
@@ -144,39 +155,10 @@ class MLResults(object):
     def __init__(self, results, sc):
         self._java_results = results
         self.sc = sc
-        try:
-            if MLResults.sqlContext is None:
-                MLResults.sqlContext = SQLContext(sc)
-        except AttributeError:
-            MLResults.sqlContext = SQLContext(sc)
 
     def __repr__(self):
         return "MLResults"
 
-    def getNumPyArray(self, *outputs):
-        """
-        Parameters
-        ----------
-        outputs: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).toBinaryBlockMatrix().getMatrixBlock()) for out in outputs]
-        if len(outs) == 1:
-            return outs[0]
-        return outs
-
-    def getDataFrame(self, *outputs):
-        """
-        Parameters
-        ----------
-        outputs: string, list of strings
-            Output variables as defined inside the DML script.
-        """
-        outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs]
-        if len(outs) == 1:
-            return outs[0]
-        return outs
-
     def get(self, *outputs):
         """
         Parameters

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/systemml/mllearn/estimators.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py
index ceead4d..82e0b2c 100644
--- a/src/main/python/systemml/mllearn/estimators.py
+++ b/src/main/python/systemml/mllearn/estimators.py
@@ -131,7 +131,7 @@ class BaseSystemMLEstimator(Estimator):
                 else:
                     return retPDF
             else:
-                retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
+                retNumPy = convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
                 if isinstance(X, np.ndarray):
                     return retNumPy
                 else:

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bcf43133/src/main/python/tests/test_mlcontext.py
----------------------------------------------------------------------
diff --git a/src/main/python/tests/test_mlcontext.py b/src/main/python/tests/test_mlcontext.py
index b9ecb00..353771f 100644
--- a/src/main/python/tests/test_mlcontext.py
+++ b/src/main/python/tests/test_mlcontext.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -23,6 +22,8 @@ import unittest
 
 from pyspark.context import SparkContext
 
+import numpy as np
+
 from systemml import MLContext, dml, pydml
 
 sc = SparkContext()
@@ -64,6 +65,15 @@ class TestAPI(unittest.TestCase):
         m2 = ml.execute(script).get("m2")
         self.assertEqual(repr(m2.toDF()), "DataFrame[__INDEX: double, C1: double, C2: double]")
 
+    def test_matrix_toNumPy(self):
+        script = """
+        m2 = m1 * 2
+        """
+        rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
+        script = dml(script).input(m1=rdd1).output("m2")
+        m2 = ml.execute(script).get("m2")
+        self.assertTrue((m2.toNumPy() == np.array([[2.0, 4.0], [6.0, 8.0]])).all())
+
     def test_input_single(self):
         script = """
         x2 = x1 + 1
@@ -88,15 +98,14 @@ class TestAPI(unittest.TestCase):
         rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
         rdd2 = sc.parallelize(["5.0,6.0", "7.0,8.0"])
         script = dml(sums).input(m1=rdd1).input(m2=rdd2).output("s1", "s2", "s3")
-        self.assertEqual(
-            ml.execute(script).get("s1", "s2", "s3"), [10.0, 26.0, "whatever"])
+        self.assertEqual(ml.execute(script).get("s1", "s2", "s3"), [10.0, 26.0, "whatever"])
 
     def test_pydml(self):
         script = "A = full('1 2 3 4 5 6 7 8 9', rows=3, cols=3)\nx = toString(A)"
         script = pydml(script).output("x")
         self.assertEqual(
-            ml.execute(script).get("x"),
-            '1.000 2.000 3.000\n4.000 5.000 6.000\n7.000 8.000 9.000\n'
+                ml.execute(script).get("x"),
+                '1.000 2.000 3.000\n4.000 5.000 6.000\n7.000 8.000 9.000\n'
         )