You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/08/25 00:20:21 UTC

incubator-systemml git commit: [SYSTEMML-865] Add Matrix Wrapper to Python MLContext

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 827c4becd -> 2f3ab9880


[SYSTEMML-865] Add Matrix Wrapper to Python MLContext

This exposes the Matrix type that is present in the new Java MLContext to the new Python MLContext by allowing the Java side to return all of the correct types via a simple call to get() on an MLResults object, including returning the existing Matrix type. On the Java side, the new Matrix type can already be used for future SystemML scripts, and contains methods for converting to other types, such as a Spark DataFrame. On the Python side, this simply creates a wrapper around this Java Matrix object. Then, the user can either use this Matrix object in a future SystemML script, or can convert it to a PySpark DataFrame with toDF().

Closes #217.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/2f3ab988
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/2f3ab988
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/2f3ab988

Branch: refs/heads/master
Commit: 2f3ab98804dace8b1ac1fef70d841c8df88290ab
Parents: 827c4be
Author: Mike Dusenberry <mw...@us.ibm.com>
Authored: Wed Aug 24 17:14:11 2016 -0700
Committer: Mike Dusenberry <mw...@us.ibm.com>
Committed: Wed Aug 24 17:14:12 2016 -0700

----------------------------------------------------------------------
 .../apache/sysml/api/mlcontext/MLResults.java   |  14 +-
 src/main/python/SystemML.py                     | 155 +++++++++++++------
 src/main/python/SystemMLtests.py                |  25 ++-
 3 files changed, 134 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2f3ab988/src/main/java/org/apache/sysml/api/mlcontext/MLResults.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLResults.java b/src/main/java/org/apache/sysml/api/mlcontext/MLResults.java
index 289f490..31798e0 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLResults.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLResults.java
@@ -340,12 +340,14 @@ public class MLResults {
 
 	public Object get(String outputName) {
 		Data data = getData(outputName);
-	  if (data instanceof ScalarObject) {
-	  	ScalarObject so = (ScalarObject) data;
-	    	return so.getValue();
-	  } else {
-	      return data;
-	  }
+		if (data instanceof ScalarObject) {
+			ScalarObject so = (ScalarObject) data;
+			return so.getValue();
+		} else if(data instanceof MatrixObject) {
+			return getMatrix(outputName);
+		} else {
+			return data;
+		}
 	}
 
 	/**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2f3ab988/src/main/python/SystemML.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemML.py b/src/main/python/SystemML.py
index 85731ed..7142a9d 100644
--- a/src/main/python/SystemML.py
+++ b/src/main/python/SystemML.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
@@ -22,21 +21,112 @@
 import os
 
 from py4j.java_gateway import JavaObject
-from py4j.java_collections import ListConverter, JavaArray, JavaList
-from pyspark import SparkContext, RDD
-from pyspark.mllib.common import _java2py, _py2java
-from pyspark.serializers import PickleSerializer, AutoBatchedSerializer
-from pyspark.sql import DataFrame
+from pyspark import SparkContext
+import pyspark.mllib.common
+
+
+def dml(scriptString):
+    """
+    Create a dml script object based on a string.
+
+    Parameters
+    ----------
+    scriptString: string
+        Can be a path to a dml script or a dml script itself.
+
+    Returns
+    -------
+    script: Script instance
+        Instance of a script object.
+    """
+    if not isinstance(scriptString, str):
+        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+    return Script(scriptString, scriptType="dml")
+
+
+def pydml(scriptString):
+    """
+    Create a pydml script object based on a string.
+
+    Parameters
+    ----------
+    scriptString: string
+        Can be a path to a pydml script or a pydml script itself.
+
+    Returns
+    -------
+    script: Script instance
+        Instance of a script object.
+    """
+    if not isinstance(scriptString, str):
+        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
+    return Script(scriptString, scriptType="pydml")
+
+
+def _java2py(sc, obj):
+    """ Convert Java object to Python. """
+    # TODO: Port this private PySpark function.
+    obj = pyspark.mllib.common._java2py(sc, obj)
+    if isinstance(obj, JavaObject):
+        class_name = obj.getClass().getSimpleName()
+        if class_name == 'Matrix':
+            obj = Matrix(obj, sc)
+    return obj
+
+
+def _py2java(sc, obj):
+    """ Convert Python object to Java. """
+    if isinstance(obj, Matrix):
+        obj = obj._java_matrix
+    # TODO: Port this private PySpark function.
+    obj = pyspark.mllib.common._py2java(sc, obj)
+    return obj
+
+
+class Matrix(object):
+    """
+    Wrapper around a Java Matrix object.
+
+    Parameters
+    ----------
+    javaMatrix: JavaObject
+        A Java Matrix object as returned by calling `ml.execute().get()`.
+
+    sc: SparkContext
+        SparkContext
+    """
+    def __init__(self, javaMatrix, sc):
+        self._java_matrix = javaMatrix
+        self.sc = sc
+
+    def __repr__(self):
+        return "Matrix"
+
+    def toDF(self):
+        """
+        Convert the Matrix to a PySpark SQL DataFrame.
+
+        Returns
+        -------
+        df: PySpark SQL DataFrame
+            A PySpark SQL DataFrame representing the matrix, with
+            one "ID" column containing the row index (since Spark
+            DataFrames are unordered), followed by columns of doubles
+            for each column in the matrix.
+        """
+        jdf = self._java_matrix.asDataFrame()
+        df = _java2py(self.sc, jdf)
+        return df
 
 
 class MLResults(object):
     """
-    Wrapper around the Java ML Results object.
+    Wrapper around a Java ML Results object.
 
     Parameters
     ----------
     results: JavaObject
-        A Java MLResults object as returned by calling ml.execute()
+        A Java MLResults object as returned by calling `ml.execute()`.
 
     sc: SparkContext
         SparkContext
@@ -67,8 +157,11 @@ class Script(object):
 
     Parameters
     ----------
-    path: string
+    scriptString: string
         Can be either a file path to a DML script or a DML script itself.
+
+    scriptType: string
+        Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like).
     """
     def __init__(self, scriptString, scriptType="dml"):
         self.scriptString = scriptString
@@ -81,8 +174,8 @@ class Script(object):
         Parameters
         ----------
         args: name, value tuple
-            where name is a string and currently supported value formats
-            are double, string, rdds and list of such object.
+            where name is a string, and currently supported value formats
+            are double, string, dataframe, rdd, and list of such object.
 
         kwargs: dict of name, value pairs
             To know what formats are supported for name and value, look above.
@@ -99,51 +192,13 @@ class Script(object):
         """
         Parameters
         ----------
-        outputs: string, list of strings
+        names: string, list of strings
             Output variables as defined inside the DML script.
         """
         self._output.extend(names)
         return self
 
 
-def pydml(scriptString):
-    """
-    Create a pydml script object based on a string.
-
-    Parameters
-    ----------
-    scriptString: string
-        Can be a path to a pydml script or a pydml script itself.
-
-    Returns
-    -------
-    script: Script instance
-        Instance of a script object.
-    """
-    if not isinstance(scriptString, str):
-        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
-    return Script(scriptString, scriptType="pydml")
-
-
-def dml(scriptString):
-    """
-    Create a dml script object based on a string.
-
-    Parameters
-    ----------
-    scriptString: string
-        Can be a path to a dml script or a dml script itself.
-
-    Returns
-    -------
-    script: Script instance
-        Instance of a script object.
-    """
-    if not isinstance(scriptString, str):
-        raise ValueError("scriptString should be a string, got %s" % type(scriptString))
-    return Script(scriptString, scriptType="dml")
-
-
 class MLContext(object):
     """
     Wrapper around the new SystemML MLContext.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/2f3ab988/src/main/python/SystemMLtests.py
----------------------------------------------------------------------
diff --git a/src/main/python/SystemMLtests.py b/src/main/python/SystemMLtests.py
index 5dcae4a..e11a694 100644
--- a/src/main/python/SystemMLtests.py
+++ b/src/main/python/SystemMLtests.py
@@ -21,12 +21,9 @@
 #-------------------------------------------------------------
 import unittest
 
-from pyspark.sql import SQLContext
 from pyspark.context import SparkContext
 
-from SystemML import dml
-from SystemML import pydml
-from SystemML import MLContext
+from SystemML import MLContext, dml, pydml
 
 sc = SparkContext()
 ml = MLContext(sc)
@@ -47,6 +44,26 @@ class TestAPI(unittest.TestCase):
         self.assertEqual(ml.execute(script).get("x1", "x2"), [0.2, 1.2])
         self.assertEqual(ml.execute(script).get("x1", "x3"), [0.2, 2.2])
 
+    def test_output_matrix(self):
+        sums = """
+        s1 = sum(m1)
+        m2 = m1 * 2
+        """
+        rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
+        script = dml(sums).input(m1=rdd1).out("s1", "m2")
+        s1, m2 = ml.execute(script).get("s1", "m2")
+        self.assertEqual((s1, repr(m2)), (10.0, "Matrix"))
+
+    def test_matrix_toDF(self):
+        sums = """
+        s1 = sum(m1)
+        m2 = m1 * 2
+        """
+        rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"])
+        script = dml(sums).input(m1=rdd1).out("m2")
+        m2 = ml.execute(script).get("m2")
+        self.assertEqual(repr(m2.toDF()), "DataFrame[ID: double, C1: double, C2: double]")
+
     def test_input_single(self):
         script = """
         x2 = x1 + 1