You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2016/09/06 23:08:26 UTC

incubator-systemml git commit: [SYSTEMML-881] [WIP] Added indexing, relational and boolean operators + mathematical/trigonometric builtin functions for Python DSL

Repository: incubator-systemml
Updated Branches:
  refs/heads/master d39865e9e -> f10fa2b8f


[SYSTEMML-881] [WIP] Added indexing, relational and boolean operators + mathematical/trigonometric builtin functions for Python DSL

- Left and Right Indexing
- Relational operators (<, <=, >, >=, ==, !=)
- Boolean operators (&, |)
- Mathematical and Trigonometric builtin functions: 'exp', 'log', 'abs', 'sqrt', 'round', 'floor', 'ceil', 'sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'sign'

Closes #233.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/f10fa2b8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/f10fa2b8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/f10fa2b8

Branch: refs/heads/master
Commit: f10fa2b8f94ab64395585de9826696b3846aa3b7
Parents: d39865e
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Tue Sep 6 16:02:47 2016 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Tue Sep 6 16:02:47 2016 -0700

----------------------------------------------------------------------
 src/main/python/systemml/defmatrix.py | 365 ++++++++++++++++++++++++-----
 1 file changed, 303 insertions(+), 62 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f10fa2b8/src/main/python/systemml/defmatrix.py
----------------------------------------------------------------------
diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py
index 2994092..e1bd3b9 100644
--- a/src/main/python/systemml/defmatrix.py
+++ b/src/main/python/systemml/defmatrix.py
@@ -19,7 +19,9 @@
 #
 #-------------------------------------------------------------
 
-__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
+trigFn = [ 'exp', 'log', 'abs', 'sqrt', 'round', 'floor', 'ceil', 'sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'sign' ]
+__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve' ] + trigFn
+
 
 from pyspark import SparkContext
 from pyspark.sql import DataFrame, SQLContext
@@ -36,14 +38,14 @@ def setSparkContext(sc):
     sc: SparkContext
         SparkContext
     """
-    matrix.ml = MLContext(sc)
     matrix.sc = sc
-
+    matrix.ml = MLContext(matrix.sc)
 
 def checkIfMLContextIsSet():
     if matrix.ml is None:
         raise Exception('Expected setSparkContext(sc) to be called.')
 
+########################## AST related operations ##################################
 
 class DMLOp(object):
     """
@@ -55,7 +57,24 @@ class DMLOp(object):
 
     def _visit(self, execute=True):
         matrix.dml = matrix.dml + self.dml
+    
+# Special object used internally to specify the placeholder which will be replaced by output ID
+# This helps to provide dml containing output ID in constructIntermediateNode
+OUTPUT_ID = '$$OutputID$$'
 
+def constructIntermediateNode(inputs, dml):
+    """
+    Convenient utility to create an intermediate node of AST.
+    
+    Parameters
+    ----------
+    inputs = list of input matrix objects and/or DMLOp
+    dml = list of DML string (which will be eventually joined before execution). To specify out.ID, please use the placeholder  
+    """
+    dmlOp = DMLOp(inputs)
+    out = matrix(None, op=dmlOp)
+    dmlOp.dml = [out.ID if x==OUTPUT_ID else x for x in dml]
+    return out
 
 def reset():
     """
@@ -64,6 +83,23 @@ def reset():
     for m in matrix.visited:
         m.visited = False
     matrix.visited = []
+    matrix.ml = MLContext(matrix.sc)
+    matrix.dml = []
+    matrix.script = pydml('')
+
+def performDFS(outputs, execute):
+    """
+    Traverses the forest of nodes rooted at outputs nodes and returns the DML script to execute
+    """
+    for m in outputs:
+        m.output = True
+        m._visit(execute=execute)
+    return ''.join(matrix.dml)
+
+###############################################################################
+
+
+########################## Utility functions ##################################
 
 
 def binaryOp(lhs, rhs, opStr):
@@ -85,26 +121,133 @@ def binaryOp(lhs, rhs, opStr):
         rhsStr = str(rhs)
     else:
         raise TypeError('Incorrect type')
-    dmlOp = DMLOp(inputs)
-    out = matrix(None, op=dmlOp)
-    dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
-    return out
+    return constructIntermediateNode(inputs, [OUTPUT_ID, ' = ', lhsStr, opStr, rhsStr, '\n'])
 
+def getValue(obj):
+    if isinstance(obj, matrix):
+        return obj.ID
+    elif isinstance(obj, float) or isinstance(obj, int):
+        return str(obj)
+    else:
+        raise TypeError('Unsupported type for ' + s)
 
 def binaryMatrixFunction(X, Y, fnName):
     """
-    Common function called by supported PyDML built-in function that has two arguments both of which are matrices.
-    TODO: This needs to be generalized to support arbitrary arguments of differen types.
+    Common function called by supported PyDML built-in function that has two arguments.
     """
-    if not isinstance(X, matrix) or not isinstance(Y, matrix):
-        raise TypeError('Incorrect input type. Expected matrix type')
-    inputs = [X, Y]
-    dmlOp = DMLOp(inputs)
-    out = matrix(None, op=dmlOp)
-    dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
-    return out
+    return constructIntermediateNode([X, Y], [OUTPUT_ID, ' = ', fnName,'(', getValue(X), ', ', getValue(Y), ')\n'])
+
+def unaryMatrixFunction(X, fnName):
+    """
+    Common function called by supported PyDML built-in function that has one argument.
+    """
+    return constructIntermediateNode([X], [OUTPUT_ID, ' = ', fnName,'(', getValue(X), ')\n'])
+
+# utility function that converts 1:3 into DML string
+def convertSeqToDML(s):
+    ret = []
+    if s is None:
+        return ''
+    elif isinstance(s, slice):
+        if s.step is not None:
+            raise ValueError('Slicing with step is not supported.')
+        if s.start is None:
+            ret = ret + [ '0 : ' ]
+        else:
+            ret = ret + [ getValue(s.start), ':' ]
+        if s.start is None:
+            ret = ret + [ '' ]
+        else:
+            ret = ret + [ getValue(s.stop) ]
+    else:
+        ret = ret + [ getValue(s) ]
+    return ''.join(ret)
+
+# utility function that converts index (such as [1, 2:3]) into DML string
+def getIndexingDML(index):
+    ret = [ '[' ]
+    if isinstance(index, tuple) and len(index) == 1:
+        ret = ret + [ convertSeqToDML(index[0]), ',' ]
+    elif isinstance(index, tuple) and len(index) == 2:
+        ret = ret + [ convertSeqToDML(index[0]), ',', convertSeqToDML(index[1]) ]
+    else:
+        raise TypeError('matrix indexes can only be tuple of length 2. For example: m[1,1], m[0:1,], m[:, 0:1]')
+    return ret + [ ']' ]
+
+def convertOutputsToList(outputs):
+    if isinstance(outputs, matrix):
+        return [ outputs ]
+    elif isinstance(outputs, list):
+        for o in outputs:
+            if not isinstance(o, matrix):
+                raise TypeError('Only matrix or list of matrix allowed')
+        return outputs
+    else:
+        raise TypeError('Only matrix or list of matrix allowed')
+
+def resetOutputFlag(outputs):
+    for m in outputs:
+        m.output = False
+
+def populateOutputs(outputs, results, outputDF):
+    """
+    Set the attribute 'data' of the matrix by fetching it from MLResults class
+    """
+    for m in outputs:
+        if outputDF:
+            m.data = results.getDataFrame(m.ID)
+        else:
+            m.data = results.getNumPyArray(m.ID)
+
+###############################################################################
+
+########################## Global user-facing functions #######################
+
+def exp(X):
+    return unaryMatrixFunction(X, 'exp')
+    
+def log(X, y=None):
+    if y is None:
+        return unaryMatrixFunction(X, 'log')
+    else:
+        return binaryMatrixFunction(X, y, 'log')
+
+def abs(X):
+    return unaryMatrixFunction(X, 'abs')
+        
+def sqrt(X):
+    return unaryMatrixFunction(X, 'sqrt')
+    
+def round(X):
+    return unaryMatrixFunction(X, 'round')
+
+def floor(X):
+    return unaryMatrixFunction(X, 'floor')
 
+def ceil(X):
+    return unaryMatrixFunction(X, 'ceil')
+    
+def sin(X):
+    return unaryMatrixFunction(X, 'sin')
 
+def cos(X):
+    return unaryMatrixFunction(X, 'cos')
+
+def tan(X):
+    return unaryMatrixFunction(X, 'tan')
+
+def asin(X):
+    return unaryMatrixFunction(X, 'asin')
+
+def acos(X):
+    return unaryMatrixFunction(X, 'acos')
+
+def atan(X):
+    return unaryMatrixFunction(X, 'atan')
+
+def sign(X):
+    return unaryMatrixFunction(X, 'sign')
+    
 def solve(A, b):
     """
     Computes the least squares solution for system of linear equations A %*% x = b
@@ -133,39 +276,46 @@ def solve(A, b):
     """
     return binaryMatrixFunction(A, b, 'solve')
 
-
 def eval(outputs, outputDF=False, execute=True):
     """
     Executes the unevaluated DML script and computes the matrices specified by outputs.
 
     Parameters
     ----------
-    outputs: list of matrices
+    outputs: list of matrices or a matrix object
     outputDF: back the data of matrix as PySpark DataFrame
     """
     checkIfMLContextIsSet()
     reset()
-    matrix.dml = []
-    matrix.script = pydml('')
-    if isinstance(outputs, matrix):
-        outputs = [ outputs ]
-    elif not isinstance(outputs, list):
-        raise TypeError('Incorrect input type')
-    for m in outputs:
-        m.output = True
-        m._visit(execute=execute)
+    outputs = convertOutputsToList(outputs)
+    matrix.script.scriptString = performDFS(outputs, execute)
     if not execute:
-        return ''.join(matrix.dml)
-    matrix.script.scriptString = ''.join(matrix.dml)
+        resetOutputFlag(outputs)
+        return matrix.script.scriptString
     results = matrix.ml.execute(matrix.script)
-    # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
-    for m in outputs:
-        if outputDF:
-            m.data = results.getDataFrame(m.ID)
-        else:
-            m.data = results.getNumPyArray(m.ID)
-
-
+    populateOutputs(outputs, results, outputDF)
+    resetOutputFlag(outputs)
+            
+###############################################################################
+
+# DESIGN DECISIONS:
+# 1. Until the eval() method is invoked, we create an AST (not exposed to the user) that consists of unevaluated operations and the data required by those operations.
+#    As an analogy, a Spark user can treat the eval() method as similar to calling RDD.persist() followed by RDD.count().
+# 2. The AST consists of two kinds of nodes: nodes of type matrix and nodes of type DMLOp.
+#    Both of these classes expose a _visit method, which helps in traversing the AST in a depth-first (DFS) manner.
+# 3. A matrix object can either be evaluated or not.
+#    If evaluated, the attribute 'data' is set to one of the supported types (for example: NumPy array or DataFrame). In this case, the attribute 'op' is set to None.
+#    If not evaluated, the attribute 'op' refers to one of the intermediate nodes of the AST and is of type DMLOp. In this case, the attribute 'data' is set to None.
+# 5. DMLOp has an attribute 'inputs' which contains a list of matrix objects or DMLOp objects.
+# 6. To simplify the traversal, every matrix object is considered immutable and every matrix operation creates a new matrix object.
+#    As an example:
+#    - m1 = sml.matrix(np.ones((3,3))) creates a matrix object backed by 'data=np.ones((3,3))'.
+#    - m1 = m1 * 2 will create a new matrix object which is now backed by 'op=DMLOp( ... )' whose input is the earlier created matrix object.
+# 7. Left indexing (implemented in the __setitem__ method) is a special case, where Python expects the existing object to be mutated.
+#    To ensure the above property, we make a deep copy of the existing object and point any references to the left-indexed matrix to the newly created object.
+#    Then the left-indexed matrix is set to be backed by a DMLOp consisting of the following pydml:
+#    left-indexed-matrix = new-deep-copied-matrix
+#    left-indexed-matrix[index] = value
 class matrix(object):
     """
     matrix class is a python wrapper that implements basic matrix operator.
@@ -239,16 +389,14 @@ class matrix(object):
         matrix.systemmlVarID += 1
         self.output = False
         self.ID = 'mVar' + str(matrix.systemmlVarID)
-        if isinstance(data, SUPPORTED_TYPES):
-            self.data = data
-        elif hasattr(data, '_jdf'):
-            self.data = data
-        elif data is None and op is not None:
-            self.data = None
-            # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
-            self.op = op
-        else:
+        self.referenced = []
+        # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation
+        self.op = op
+        self.data = data
+        if not (isinstance(data, SUPPORTED_TYPES) or hasattr(data, '_jdf') or (data is None and op is not None)):
             raise TypeError('Unsupported input type')
+        if op is not None:
+            self.referenced = self.referenced + [ op ]
 
     def eval(self, outputDF=False):
         """
@@ -287,6 +435,27 @@ class matrix(object):
             self.data = sqlContext.createDataFrame(self.toPandas())
         return self.data
 
+    def _markAsVisited(self):
+        self.visited = True
+        # for cleanup
+        matrix.visited = matrix.visited + [ self ]
+        return self
+        
+    def _registerAsInput(self, execute):
+        # TODO: Remove this when automatic registration of frame is resolved
+        matrix.dml = [ self.ID,  ' = load(\" \", format=\"csv\")\n'] + matrix.dml
+        if isinstance(self.data, DataFrame) and execute:
+            matrix.script.input(self.ID, self.data)
+        elif execute:
+            matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
+        return self
+        
+    def _registerAsOutput(self, execute):
+        # TODO: Remove this when automatic registration of frame is resolved
+        matrix.dml = matrix.dml + ['save(',  self.ID, ', \" \")\n']
+        if execute:
+            matrix.script.output(self.ID)
+    
     def _visit(self, execute=True):
         """
         This function is called for two scenarios:
@@ -296,26 +465,18 @@ class matrix(object):
         """
         if self.visited:
             return self
-        self.visited = True
-        # for cleanup
-        matrix.visited = matrix.visited + [ self ]
+        self._markAsVisited()
         if self.data is not None:
-            matrix.dml = matrix.dml + [ self.ID,  ' = load(\" \", format=\"csv\")\n']
-            if isinstance(self.data, DataFrame) and execute:
-                matrix.script.input(self.ID, self.data)
-            elif  execute:
-                matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
-            return self
+            self._registerAsInput(execute)
         elif self.op is not None:
+            # Traverse the AST
             for m in self.op.inputs:
                 m._visit(execute=execute)
             self.op._visit(execute=execute)
         else:
             raise Exception('Expected either op or data to be set')
         if self.data is None and self.output:
-            matrix.dml = matrix.dml + ['save(',  self.ID, ', \" \")\n']
-            if execute:
-                matrix.script.output(self.ID)
+            self._registerAsOutput(execute)
         return self
 
     def __repr__(self):
@@ -330,6 +491,8 @@ class matrix(object):
             print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
         return '<SystemML.defmatrix.matrix object>'
 
+    ######################### Arithmetic operators ######################################
+    
     def __add__(self, other):
         return binaryOp(self, other, ' + ')
 
@@ -343,8 +506,17 @@ class matrix(object):
         return binaryOp(self, other, ' // ')
 
     def __div__(self, other):
+        """
+        Performs division (Python 2 way).
+        """
         return binaryOp(self, other, ' / ')
 
+    def __truediv__(self, other):
+        """
+        Performs division (Python 3 way).
+        """
+        return binaryOp(self, other, ' / ')
+    
     def __mod__(self, other):
         return binaryOp(self, other, ' % ')
 
@@ -371,7 +543,49 @@ class matrix(object):
 
     def __rpow__(self, other):
         return binaryOp(other, self, ' ** ')
-
+        
+    def dot(self, other):
+        """
+        Numpy way of performing matrix multiplication
+        """
+        return binaryMatrixFunction(self, other, 'dot')
+    
+    def __matmul__(self, other):
+        """
+        Performs matrix multiplication (infix operator: @). See PEP 465)
+        """
+        return binaryMatrixFunction(self, other, 'dot')
+    
+    
+    ######################### Relational/Boolean operators ######################################
+    
+    def __lt__(self, other):
+        return binaryOp(other, self, ' < ')
+        
+    def __le__(self, other):
+        return binaryOp(other, self, ' <= ')
+    
+    def __gt__(self, other):
+        return binaryOp(other, self, ' > ')
+        
+    def __ge__(self, other):
+        return binaryOp(other, self, ' >= ')
+        
+    def __eq__(self, other):
+        return binaryOp(other, self, ' == ')
+        
+    def __ne__(self, other):
+        return binaryOp(other, self, ' != ')
+        
+    # TODO: Cast the output back into scalar and return boolean results
+    def __and__(self, other):
+        return binaryOp(other, self, ' & ')
+
+    def __or__(self, other):
+        return binaryOp(other, self, ' | ')
+        
+    ######################### Aggregation functions ######################################
+    
     def sum(self, axis=None):
         return self._aggFn('sum', axis)
 
@@ -410,6 +624,33 @@ class matrix(object):
         else:
             dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
         return out
-
-    def dot(self, other):
-        return binaryMatrixFunction(self, other, 'dot')
+        
+    ######################### Indexing operators ######################################
+    
+    def __getitem__(self, index):
+        """
+        Implements evaluation of right indexing operations such as m[1,1], m[0:1,], m[:, 0:1]
+        """
+        dmlOp = DMLOp([self])
+        out = matrix(None, op=dmlOp)
+        dmlOp.dml = [out.ID, ' = ', self.ID ] + getIndexingDML(index) + [ '\n' ]
+        return out
+    
+    # Performs deep copy if the matrix is backed by data
+    def _prepareForInPlaceUpdate(self):
+        temp = matrix(self.data, op=self.op)
+        self.ID, temp.ID = temp.ID, self.ID # Copy even the IDs as the IDs might be used to create DML
+        for op in self.referenced:
+            op.inputs.remove(self) #while self in op.inputs:
+            op.inputs = op.inputs + [ temp ]
+        self.op = DMLOp([temp], dml=[self.ID, " = ", temp.ID])
+        self.data = None       
+    
+    def __setitem__(self, index, value):
+        """
+        Implements evaluation of left indexing operations such as m[1,1]=2
+        """
+        self._prepareForInPlaceUpdate()
+        if isinstance(value, matrix): 
+            self.op.inputs = self.op.inputs + [ value ]
+        self.op.dml = self.op.dml + [ '\n', self.ID ] + getIndexingDML(index) + [ ' = ',  getValue(value), '\n']
\ No newline at end of file