You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@systemds.apache.org by GitBox <gi...@apache.org> on 2021/05/10 09:30:00 UTC
[GitHub] [systemds] Baunsgaard commented on a change in pull request #1270: [WIP][Python] Create a List Class with multiple outputs and allow it to be part of the DAG

Baunsgaard commented on a change in pull request #1270:
URL: https://github.com/apache/systemds/pull/1270#discussion_r629184699



##########
File path: src/main/python/systemds/operator/operation_node.py
##########
@@ -140,17 +137,14 @@ def __parse_output_result_frame(self, result_variables, var_name):
 
     def __parse_output_result_list(self, result_variables):

Review comment:
       move to List object

##########
File path: src/main/python/systemds/operator/operation_node.py
##########
@@ -167,19 +161,21 @@ def get_lineage_trace(self) -> str:
 
     def code_line(self, var_name: str, unnamed_input_vars: Sequence[str],
                   named_input_vars: Dict[str, str]) -> str:
+        if self.operation is None:
+            return f'{var_name}={list(named_input_vars.values())[0]}'
+
         if self.operation in BINARY_OPERATIONS:
             assert len(
                 named_input_vars) == 0, 'Named parameters can not be used with binary operations'
             assert len(
                 unnamed_input_vars) == 2, 'Binary Operations need exactly two input variables'
             return f'{var_name}={unnamed_input_vars[0]}{self.operation}{unnamed_input_vars[1]}'
 
-        inputs_comma_sep = create_params_string(
-            unnamed_input_vars, named_input_vars)
+        inputs_comma_sep = create_params_string(unnamed_input_vars, named_input_vars)
 
         if self.output_type == OutputType.LIST:

Review comment:
       move to list object
   and in general i am trying to remove the logic from this object if it is not generally applicable, this is the case for lists.

##########
File path: src/main/python/systemds/script_building/dag.py
##########
@@ -74,12 +74,13 @@ class DAGNode(ABC):
     sds_context: 'SystemDSContext'
     _unnamed_input_nodes: Sequence[Union['DAGNode', str, int, float, bool]]
     _named_input_nodes: Dict[str, Union['DAGNode', str, int, float, bool]]
+    _named_output_nodes: Dict[str, Union['DAGNode', str, int, float, bool]]

Review comment:
       i would hide this away, to only allow one output, in case it is a multi output, make it a list containing the multiple nodes, that then can be extracted individually.

##########
File path: src/main/python/systemds/script_building/dag.py
##########
@@ -147,4 +148,12 @@ def script(self):
 
     @property
     def script_str(self):
-        return self._script.dml_script
\ No newline at end of file
+        return self._script.dml_script
+
+    @property
+    def dml_name(self):
+        return self._dml_name
+
+    @dml_name.setter
+    def dml_name(self, value):
+        self._dml_name = value

Review comment:
       add newlines in the end of the file.

##########
File path: src/main/python/systemds/operator/operation_node.py
##########
@@ -167,19 +161,21 @@ def get_lineage_trace(self) -> str:
 
     def code_line(self, var_name: str, unnamed_input_vars: Sequence[str],
                   named_input_vars: Dict[str, str]) -> str:
+        if self.operation is None:
+            return f'{var_name}={list(named_input_vars.values())[0]}'

Review comment:
       I'm not sure what this covers.

##########
File path: src/main/python/tests/algorithms/test_pca.py
##########
@@ -48,10 +51,36 @@ def test_500x2(self):
         m1 = self.generate_matrices_for_pca(30, seed=1304)
         X = self.sds.from_numpy( m1)
         # print(features)
-        [res, model, _, _ ] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
+        [res, model, _, _] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
         for (x, y) in zip(m1, res):
             self.assertTrue((x[0] > 0 and y > 0) or (x[0] < 0 and y < 0))
 
+    def test_500x2b(self):
+        """
+        This test constructs a line of values in 2d space.
+        That if fit correctly maps perfectly to 1d space.
+        The check is simply if the input value was positive
+        then the output value should be similar.
+        """
+        m1 = self.generate_matrices_for_pca(30, seed=1304)
+        node0 = self.sds.from_numpy(m1)
+        # print(features)
+        node1 = List(node0.sds_context, 'pca', named_input_nodes={"X": node0, "K": 1, "scale": "FALSE", "center": "FALSE"},
+                     outputs=[("res", OutputType.MATRIX), ("model", OutputType.MATRIX), ("scale", OutputType.MATRIX), ("center", OutputType.MATRIX)])
+        node2 = node1["res"].abs()
+        result = node2.compute(verbose=True)
+
+    def test_multiple_outputs(self):
+        # Added a second test function because test_500x2b doesn't account for the case where multiple outputs of a node which provides
+        # multiple outputs are used
+        node0 = self.sds.from_numpy(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]))
+        node1 = self.sds.from_numpy(np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]))
+        params_dict = {'X': node0, 'Y': node1}
+        node2 = List(self.sds, 'split', named_input_nodes=params_dict,
+                     outputs=[("X_train", OutputType.MATRIX), ("X_test", OutputType.MATRIX), ("Y_train", OutputType.MATRIX), ("Y_test", OutputType.MATRIX)])
+        node3 = node2["X_train"] + node2["Y_train"]

Review comment:
       this probably does not work, because the dimensions of X_train and Y_train is not equal.
   
   I would like to have a new test file only for lists. located at 
   src/main/python/tests/list/...
   
   Again i would really like having a dedicated tests that looks like this:
   
   ```Python
   m1 = self.sds.from_numpy(...)
   m2 = self.sds.from_numpy(...)
   list = self.sds.list(m1,m2)
   res = list[1].compute()
   self.assertTrue(np.allclose(m2, res))
   ```
   The important thing is that it also verify the output
   
   
   The only missing feature to make this work is the "sds.list(...)" function.
   it would look something like
   ```python 
   def list(self, *args: Sequence[VALID_INPUT_TYPES, ** kwargs: Dict[str, VALID_INPUT_TYPES]) -> 'Operation_Node':
       ...
       if kwargs not None and args not None:
           raise SOME ERROR...
       elif kwargs not None:
           return List(self, 'list', named_input_nodes=kwargs, outputs = kwargs) 
       else if args not None:
           return List(self, 'list', unnamed_input_nodes= args, outputs = args)
   
   ```
   

##########
File path: src/main/python/systemds/script_building/script.py
##########
@@ -158,10 +158,10 @@ def build_code(self, dag_root: DAGNode) -> None:
         :param dag_root: the topmost operation of our DAG, result of operation will be output
         """
         baseOutVarString = self._dfs_dag_nodes(dag_root)
-        if(dag_root.output_type != OutputType.NONE):
-            if(dag_root.number_of_outputs > 1):
+        if dag_root.output_type != OutputType.NONE:
+            if dag_root.output_type == OutputType.LIST:

Review comment:
       :+1: 

##########
File path: src/main/python/tests/algorithms/test_pca.py
##########
@@ -48,10 +51,36 @@ def test_500x2(self):
         m1 = self.generate_matrices_for_pca(30, seed=1304)
         X = self.sds.from_numpy( m1)
         # print(features)
-        [res, model, _, _ ] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
+        [res, model, _, _] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
         for (x, y) in zip(m1, res):
             self.assertTrue((x[0] > 0 and y > 0) or (x[0] < 0 and y < 0))
 
+    def test_500x2b(self):
+        """
+        This test constructs a line of values in 2d space.
+        That if fit correctly maps perfectly to 1d space.
+        The check is simply if the input value was positive
+        then the output value should be similar.
+        """
+        m1 = self.generate_matrices_for_pca(30, seed=1304)
+        node0 = self.sds.from_numpy(m1)
+        # print(features)
+        node1 = List(node0.sds_context, 'pca', named_input_nodes={"X": node0, "K": 1, "scale": "FALSE", "center": "FALSE"},
+                     outputs=[("res", OutputType.MATRIX), ("model", OutputType.MATRIX), ("scale", OutputType.MATRIX), ("center", OutputType.MATRIX)])
+        node2 = node1["res"].abs()
+        result = node2.compute(verbose=True)
+
+    def test_multiple_outputs(self):
+        # Added a second test function because test_500x2b doesn't account for the case where multiple outputs of a node which provides
+        # multiple outputs are used
+        node0 = self.sds.from_numpy(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]))
+        node1 = self.sds.from_numpy(np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]))
+        params_dict = {'X': node0, 'Y': node1}
+        node2 = List(self.sds, 'split', named_input_nodes=params_dict,

Review comment:
       I like that you test split!

##########
File path: src/main/python/tests/algorithms/test_pca.py
##########
@@ -48,10 +51,36 @@ def test_500x2(self):
         m1 = self.generate_matrices_for_pca(30, seed=1304)
         X = self.sds.from_numpy( m1)
         # print(features)
-        [res, model, _, _ ] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
+        [res, model, _, _] = pca(X, K=1, scale="FALSE", center="FALSE").compute()
         for (x, y) in zip(m1, res):
             self.assertTrue((x[0] > 0 and y > 0) or (x[0] < 0 and y < 0))
 
+    def test_500x2b(self):
+        """
+        This test constructs a line of values in 2d space.
+        That if fit correctly maps perfectly to 1d space.
+        The check is simply if the input value was positive
+        then the output value should be similar.
+        """
+        m1 = self.generate_matrices_for_pca(30, seed=1304)
+        node0 = self.sds.from_numpy(m1)
+        # print(features)
+        node1 = List(node0.sds_context, 'pca', named_input_nodes={"X": node0, "K": 1, "scale": "FALSE", "center": "FALSE"},
+                     outputs=[("res", OutputType.MATRIX), ("model", OutputType.MATRIX), ("scale", OutputType.MATRIX), ("center", OutputType.MATRIX)])
+        node2 = node1["res"].abs()
+        result = node2.compute(verbose=True)
+
+    def test_multiple_outputs(self):
+        # Added a second test function because test_500x2b doesn't account for the case where multiple outputs of a node which provides
+        # multiple outputs are used
+        node0 = self.sds.from_numpy(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]))
+        node1 = self.sds.from_numpy(np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]))
+        params_dict = {'X': node0, 'Y': node1}
+        node2 = List(self.sds, 'split', named_input_nodes=params_dict,
+                     outputs=[("X_train", OutputType.MATRIX), ("X_test", OutputType.MATRIX), ("Y_train", OutputType.MATRIX), ("Y_test", OutputType.MATRIX)])
+        node3 = node2["X_train"] + node2["Y_train"]

Review comment:
       and this:
   
   ```python 
   m1 = self.sds.from_numpy(...)
   m2 = self.sds.from_numpy(...)
   list = self.sds.list(m1,m2)
   tmp = list[1] + 2
   res = tmp.compute()
   assertTrue(np.allclose(m2 + 2, res))
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org