You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2020/04/26 18:36:16 UTC

[systemml] branch master updated: [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 608b9e5  [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)
608b9e5 is described below

commit 608b9e5bfb6c612134fde25249beca10c467160e
Author: Julia Le <ju...@student.tugraz.at>
AuthorDate: Sun Apr 26 20:33:27 2020 +0200

    [SYSTEMDS-316] Extended Python API (rand, lm, matrix multiplication)
    
    Add rand(), lm and matrix multiplication to Python API
    Adapt rand testcases and add exception handling to rand function
    Add testcase for LM, update testcase for rand() and add rand testcase to
    python.yml
    Update python.yml and add simple example of lm to the documentation
    
    AMLS project SS 2020.
    Closes #892.
---
 .github/workflows/python.yml                       |   4 +-
 docs/Tasks.txt                                     |   1 +
 src/main/python/docs/source/matrix.rst             |   1 +
 src/main/python/docs/source/simple_examples.rst    |  39 ++++++
 .../python/systemds/context/systemds_context.py    |  30 ++++-
 src/main/python/systemds/matrix/matrix.py          |  31 ++++-
 src/main/python/systemds/matrix/operation_node.py  |  19 +++
 src/main/python/systemds/utils/consts.py           |   2 +-
 src/main/python/tests/test_lm.py                   |  79 ++++++++++++
 src/main/python/tests/test_matrix_binary_op.py     |   3 +
 src/main/python/tests/test_matrix_rand.py          | 140 +++++++++++++++++++++
 11 files changed, 345 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 84933ac..31af9d5 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -75,6 +75,8 @@ jobs:
   
     - name: Install pip Dependencies
       run: pip install numpy py4j wheel
+    - name: Install Test Dependencies
+      run: pip install scipy scikit-learn
 
     - name: Build Python Package
       run: |
@@ -96,4 +98,4 @@ jobs:
         export SYSDS_QUIET=1
         cd src/main/python
         python -m unittest tests/lineage/*.py
-        echo "Exit Status: " $?
\ No newline at end of file
+        echo "Exit Status: " $?
diff --git a/docs/Tasks.txt b/docs/Tasks.txt
index 3a7abe7..97fa914 100644
--- a/docs/Tasks.txt
+++ b/docs/Tasks.txt
@@ -249,6 +249,7 @@ SYSTEMDS-310 Python Bindings
  * 313 Python Documentation upload via Github Actions                 OK
  * 314 Python SystemDS context manager                                OK
  * 315 Python Federated Matrices Tests                                OK
+ * 316 Extended Python API (rand, lm, mm)                             OK
 
 SYSTEMDS-320 Merge SystemDS into Apache SystemML                      OK
  * 321 Merge histories of SystemDS and SystemML                       OK
diff --git a/src/main/python/docs/source/matrix.rst b/src/main/python/docs/source/matrix.rst
index dd88c7c..e75eff4 100644
--- a/src/main/python/docs/source/matrix.rst
+++ b/src/main/python/docs/source/matrix.rst
@@ -98,3 +98,4 @@ the recommended way is to use the methods defined on ``SystemDSContext``.
 
 .. autofunction:: systemds.matrix.seq
 
+.. autofunction:: systemds.matrix.rand
\ No newline at end of file
diff --git a/src/main/python/docs/source/simple_examples.rst b/src/main/python/docs/source/simple_examples.rst
index 2175fd4..e92cfed 100644
--- a/src/main/python/docs/source/simple_examples.rst
+++ b/src/main/python/docs/source/simple_examples.rst
@@ -122,3 +122,42 @@ The output should be similar to::
    [-0.0011352 ]
    [-0.01686351]
    [-0.03839821]]
+
+SystemDS includes a built-in function lm, which solves linear regression. The lm function takes as input a matrix of
+feature vectors and a vector of response values y. The output of the function is a vector of weights.
+
+.. code-block:: python
+
+  # Import numpy and the SystemDS context
+  import numpy as np
+  from systemds.context import SystemDSContext
+
+  # Set a seed
+  np.random.seed(0)
+  # Generate matrix of feature vectors
+  features = np.random.rand(10, 15)
+  # Generate a 1-column matrix of response values
+  y = np.random.rand(10, 1)
+
+  # compute the weights
+  with SystemDSContext() as sds:
+    weights = sds.matrix(features).lm(sds.matrix(y)).compute()
+    print(weights)
+
+The output should be similar to::
+
+  [[-0.11538199]
+  [-0.20386541]
+  [-0.39956035]
+  [ 1.04078623]
+  [ 0.4327084 ]
+  [ 0.18954599]
+  [ 0.49858968]
+  [-0.26812763]
+  [ 0.09961844]
+  [-0.57000751]
+  [-0.43386048]
+  [ 0.55358873]
+  [-0.54638565]
+  [ 0.2205885 ]
+  [ 0.37957689]]
diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py
index d5bdeb8..01f31a6 100644
--- a/src/main/python/systemds/context/systemds_context.py
+++ b/src/main/python/systemds/context/systemds_context.py
@@ -30,7 +30,7 @@ import numpy as np
 from py4j.java_gateway import JavaGateway
 from py4j.protocol import Py4JNetworkError
 
-from systemds.matrix import full, seq, federated, Matrix, OperationNode
+from systemds.matrix import full, seq, federated, Matrix, rand, OperationNode
 from systemds.utils.helpers import get_module_dir
 from systemds.utils.consts import VALID_INPUT_TYPES
 
@@ -147,3 +147,31 @@ class SystemDSContext(object):
         :return: the OperationNode representing this operation
         """
         return seq(self, start, stop, step)
+
+    def rand(self, rows: int, cols: int, min: Union[float, int] = None,
+             max: Union[float, int] = None, pdf: str = "uniform",
+             sparsity: Union[float, int] = None, seed: Union[float, int] = None,
+             lambd: Union[float, int] = 1) -> OperationNode:
+        """Generates a matrix filled with random values
+
+        :param rows: number of rows (must be >= 0, otherwise ValueError)
+        :param cols: number of cols (must be >= 0, otherwise ValueError)
+        :param min: min value for cells
+        :param max: max value for cells
+        :param pdf: "uniform"/"normal"/"poisson" distribution (else ValueError)
+        :param sparsity: fraction of non-zero cells
+        :param seed: random seed
+        :param lambd: lambda value for the "poisson" distribution
+        :return: the OperationNode representing this operation
+        """
+        available_pdfs = ["uniform", "normal", "poisson"]
+        if rows < 0:
+            raise ValueError("In rand statement, can only assign rows a long (integer) value >= 0 "
+                            "-- attempted to assign value: {r}".format(r=rows))
+        if cols < 0:
+            raise ValueError("In rand statement, can only assign cols a long (integer) value >= 0 "
+                            "-- attempted to assign value: {c}".format(c=cols))
+        if pdf not in available_pdfs:
+            raise ValueError("The pdf passed is invalid! given: {g}, expected: {e}".format(g=pdf, e=available_pdfs))
+
+        return rand(self, rows, cols, min, max, pdf, sparsity, seed, lambd)
\ No newline at end of file
diff --git a/src/main/python/systemds/matrix/matrix.py b/src/main/python/systemds/matrix/matrix.py
index d4cb9b1..3fc1c57 100644
--- a/src/main/python/systemds/matrix/matrix.py
+++ b/src/main/python/systemds/matrix/matrix.py
@@ -19,7 +19,7 @@
 #
 #-------------------------------------------------------------
 
-__all__ = ['Matrix', 'federated', 'full', 'seq']
+__all__ = ['Matrix', 'federated', 'full', 'seq', 'rand']
 
 import os
 from typing import Union, Optional, Iterable, Dict, Tuple, Sequence, TYPE_CHECKING
@@ -142,3 +142,32 @@ def seq(sds_context: 'SystemDSContext', start: Union[float, int], stop: Union[fl
         start = 0
     unnamed_input_nodes = [start, stop, step]
     return OperationNode(sds_context, 'seq', unnamed_input_nodes)
+
+
+def rand(sds_context: 'SystemDSContext', rows: int, cols: int, min: Union[float, int] = None, max: Union[float, int] = None, pdf: str = "uniform",
+         sparsity: Union[float, int] = None, seed: Union[float, int] = None,
+         lambd: Union[float, int] = 1) -> OperationNode:
+    """Generates a matrix filled with random values
+
+    :param rows: number of rows
+    :param cols: number of cols
+    :param min: min value for cells (only passed on when not None)
+    :param max: max value for cells (only passed on when not None)
+    :param pdf: "uniform"/"normal"/"poisson" distribution
+    :param sparsity: fraction of non-zero cells (only passed on when not None)
+    :param seed: random seed (only passed on when not None)
+    :param lambd: lambda value for the "poisson" distribution
+    :return: the OperationNode representing this operation
+    """
+    pdf = '\"' + pdf + '\"'
+    named_input_nodes = {'rows': rows, 'cols': cols, 'pdf': pdf, 'lambda': lambd}
+    if min is not None:
+        named_input_nodes['min'] = min
+    if max is not None:
+        named_input_nodes['max'] = max
+    if sparsity is not None:
+        named_input_nodes['sparsity'] = sparsity
+    if seed is not None:
+        named_input_nodes['seed'] = seed
+
+    return OperationNode(sds_context, 'rand', [], named_input_nodes=named_input_nodes)
\ No newline at end of file
diff --git a/src/main/python/systemds/matrix/operation_node.py b/src/main/python/systemds/matrix/operation_node.py
index 9a7eff1..8a8bcae 100644
--- a/src/main/python/systemds/matrix/operation_node.py
+++ b/src/main/python/systemds/matrix/operation_node.py
@@ -159,6 +159,9 @@ class OperationNode(DAGNode):
     def __ne__(self, other):
         return OperationNode(self.sds_context, '!=', [self, other])
 
+    def __matmul__(self, other: VALID_ARITHMETIC_TYPES):
+        return OperationNode(self.sds_context, '%*%', [self, other])
+
     def l2svm(self, labels: DAGNode, **kwargs) -> 'OperationNode':
         """Perform l2svm on matrix with labels given.
 
@@ -229,3 +232,19 @@ class OperationNode(DAGNode):
             unnamed_inputs.append(weights)
         unnamed_inputs.append(moment)
         return OperationNode(self.sds_context, 'moment', unnamed_inputs, output_type=OutputType.DOUBLE)
+
+    def lm(self, y: DAGNode, **kwargs) -> 'OperationNode':
+        """Build an ``lm`` operation node with this matrix as ``X`` and ``y`` as response.
+
+        :param y: response values; its ``_np_array`` must not be empty
+        :param kwargs: additional named inputs merged into the ``lm`` parameters
+        :return: the OperationNode representing this operation
+        :raises ValueError: if the ``_np_array`` of this node or of ``y`` has size 0
+        """
+        self._check_matrix_op()
+        for operand in (self, y):
+            if operand._np_array.size == 0:
+                raise ValueError("Found array with 0 feature(s) (shape={s}) while a minimum of 1 is required."
+                                 .format(s=operand._np_array.shape))
+        named_inputs = dict({'X': self, 'y': y}, **kwargs)
+        return OperationNode(self.sds_context, 'lm', named_input_nodes=named_inputs)
\ No newline at end of file
diff --git a/src/main/python/systemds/utils/consts.py b/src/main/python/systemds/utils/consts.py
index 34506e2..4c79bda 100644
--- a/src/main/python/systemds/utils/consts.py
+++ b/src/main/python/systemds/utils/consts.py
@@ -21,7 +21,7 @@ from typing import Union
 
 MODULE_NAME = 'systemds'
 VALID_INPUT_TYPES = Union['DAGNode', str, int, float, bool]
-BINARY_OPERATIONS = ['+', '-', '/', '//', '*', '<', '<=', '>', '>=', '==', '!=']
+BINARY_OPERATIONS = ['+', '-', '/', '//', '*', '<', '<=', '>', '>=', '==', '!=', '%*%']
 # TODO add numpy array and implement for numpy array
 VALID_ARITHMETIC_TYPES = Union['DAGNode', int, float]
 
diff --git a/src/main/python/tests/test_lm.py b/src/main/python/tests/test_lm.py
new file mode 100644
index 0000000..24abd5c
--- /dev/null
+++ b/src/main/python/tests/test_lm.py
@@ -0,0 +1,79 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import warnings
+import unittest
+
+import os
+import sys
+
+import numpy as np
+
+path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")
+sys.path.insert(0, path)
+
+from systemds.context import SystemDSContext
+from sklearn.linear_model import LinearRegression
+import random
+
+sds = SystemDSContext()
+
+regressor = LinearRegression(fit_intercept=False)
+shape = (random.randrange(1, 30), random.randrange(1, 30))
+eps = 1e-05
+
+class TestLm(unittest.TestCase):
+    def setUp(self):
+        warnings.filterwarnings(
+            action="ignore", message="unclosed", category=ResourceWarning)
+
+    def tearDown(self):
+        warnings.filterwarnings(
+            action="ignore", message="unclosed", category=ResourceWarning)
+
+    def test_lm(self):
+        """Compare the lm weights against scikit-learn's intercept-free fit."""
+        X = np.random.rand(shape[0], shape[1])
+        y = np.random.rand(shape[0], 1)
+
+        # No try/except: the old handler also caught assertion failures and
+        # replaced them (and any real error) with a generic message, leaving
+        # its print(e) unreachable. unittest reports the original traceback.
+        sds_model_weights = sds.matrix(X).lm(sds.matrix(y)).compute()
+        model = regressor.fit(X, y)
+
+        model.coef_ = model.coef_.reshape(sds_model_weights.shape)
+        self.assertTrue(np.allclose(sds_model_weights, model.coef_, eps))
+
+    def test_lm_invalid_shape(self):
+        """lm must reject empty inputs by raising an exception."""
+        X = np.random.rand(shape[0], 0)
+        y = np.random.rand(0, 1)
+
+        # The old try/except Exception also caught the AssertionError raised by
+        # assertTrue(False), so this test could never fail.
+        with self.assertRaises(Exception):
+            sds.matrix(X).lm(sds.matrix(y)).compute()
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)
+    sds.close()
\ No newline at end of file
diff --git a/src/main/python/tests/test_matrix_binary_op.py b/src/main/python/tests/test_matrix_binary_op.py
index 95dac02..82be708 100644
--- a/src/main/python/tests/test_matrix_binary_op.py
+++ b/src/main/python/tests/test_matrix_binary_op.py
@@ -80,6 +80,9 @@ class TestBinaryOp(unittest.TestCase):
     def test_div3(self):
         self.assertTrue(np.allclose((sds.matrix(m1) / s).compute(), m1 / s))
 
+    def test_matmul(self):
+        self.assertTrue(np.allclose((sds.matrix(m1) @ sds.matrix(m2)).compute(), m1.dot(m2)))
+
     # TODO arithmetic with scala lhs
 
     def test_lt(self):
diff --git a/src/main/python/tests/test_matrix_rand.py b/src/main/python/tests/test_matrix_rand.py
new file mode 100644
index 0000000..d267bca
--- /dev/null
+++ b/src/main/python/tests/test_matrix_rand.py
@@ -0,0 +1,140 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Make the `systemds` package importable
+import os
+import sys
+import warnings
+import unittest
+import numpy as np
+import scipy.stats as st
+import random
+
+path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../")
+sys.path.insert(0, path)
+from systemds.context import SystemDSContext
+
+shape = (random.randrange(1, 50), random.randrange(1, 50))
+min_max = (0, 1)
+sparsity = 0.2
+seed = 123
+distributions = ["norm", "uniform"]
+
+sds = SystemDSContext()
+
+class TestRand(unittest.TestCase):
+    def setUp(self):
+        warnings.filterwarnings(
+            action="ignore", message="unclosed", category=ResourceWarning)
+
+    def tearDown(self):
+        warnings.filterwarnings(
+            action="ignore", message="unclosed", category=ResourceWarning)
+
+    def test_rand_shape(self):
+        m = sds.rand(rows=shape[0], cols=shape[1]).compute()
+        self.assertTrue(m.shape == shape)
+
+    def test_rand_min_max(self):
+        m = sds.rand(rows=shape[0], cols=shape[1], min=min_max[0], max=min_max[1]).compute()
+        self.assertTrue((m.min() >= min_max[0]) and (m.max() <= min_max[1]))
+
+    def test_rand_sparsity(self):
+        m = sds.rand(rows=shape[0], cols=shape[1], sparsity=sparsity, seed=seed).compute()
+        count, bins = np.histogram(m.flatten("F"))
+        non_zero_value_percent = sum(count[1:]) * 100 / sum(count)
+        e = 0.05
+
+        self.assertTrue(
+            sum(count) == (shape[0] * shape[1])
+            and (non_zero_value_percent >= (sparsity - e) * 100)
+            and (non_zero_value_percent <= (sparsity + e) * 100)
+        )
+
+    def test_rand_uniform_distribution(self):
+        m = sds.rand(
+            rows=shape[0],
+            cols=shape[1],
+            pdf="uniform",
+            min=min_max[0],
+            max=min_max[1],
+            seed=seed).compute()
+
+        dist = find_best_fit_distribution(m.flatten("F"), distributions)
+        self.assertTrue(dist == "uniform")
+
+    def test_rand_normal_distribution(self):
+        m = sds.rand(
+            rows=shape[0],
+            cols=shape[1],
+            pdf="normal",
+            min=min_max[0],
+            max=min_max[1],
+            seed=seed).compute()
+
+        dist = find_best_fit_distribution(m.flatten("F"), distributions)
+        self.assertTrue(dist == "norm")
+
+    def test_rand_zero_shape(self):
+        """A 0x0 request must succeed and yield an empty matrix."""
+        # No try/except: the old handler turned every failure into a generic
+        # assertFalse message (hiding the real traceback) and left its
+        # print(e) unreachable; unittest reports errors on its own.
+        m = sds.rand(rows=0, cols=0).compute()
+        self.assertTrue(np.allclose(m, np.array([[]])))
+
+    def test_rand_invalid_shape(self):
+        """Negative dimensions must be rejected with ValueError."""
+        # assertRaises replaces a try/except Exception that also swallowed the
+        # AssertionError from assertTrue(False), so the test could never fail.
+        with self.assertRaises(ValueError):
+            sds.rand(rows=1, cols=-10).compute()
+
+    def test_rand_invalid_pdf(self):
+        """An unknown pdf name ("norm") must be rejected with ValueError."""
+        # assertRaises replaces a try/except Exception that also swallowed the
+        # AssertionError from assertFalse(...), so the test could never fail.
+        with self.assertRaises(ValueError):
+            sds.rand(rows=1, cols=10, pdf="norm").compute()
+
+
+def find_best_fit_distribution(data, distribution_lst):
+    """
+    Pick the candidate distribution that fits the data best.
+
+    Every candidate is fitted to the data via scipy.stats and scored with the
+    p-value of a Kolmogorov-Smirnov test against the fitted parameters.
+    :param data: flat numpy array
+    :param distribution_lst: names of scipy.stats distributions to check
+    :return: name of the candidate with the highest p-value
+    """
+    p_values = {}
+    for name in distribution_lst:
+        fitted_params = getattr(st, name).fit(data)
+        # kstest returns (statistic, p-value); only the p-value is compared
+        p_values[name] = st.kstest(data, name, args=fitted_params)[1]
+
+    return max(p_values, key=p_values.get)
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)
+    sds.close()
\ No newline at end of file