You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ba...@apache.org on 2020/08/28 13:01:11 UTC

[systemds] branch master updated: [SYSTEMDS-2646] Python API PCA algorithm

This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 0f9b4a3  [SYSTEMDS-2646] Python API PCA algorithm
0f9b4a3 is described below

commit 0f9b4a3d9dd53e615e54251327481dba44729668
Author: baunsgaard <ba...@tugraz.at>
AuthorDate: Fri Aug 28 13:25:09 2020 +0200

    [SYSTEMDS-2646] Python API PCA algorithm
    
    This commit contains the PCA algorithm in the python API.
    It also contains minor modifications to KMeans documentation in Python,
    and an update to the Python in /docs/api/.
---
 docs/api/python/api/operator/algorithms.html   |  35 ++++++++++
 docs/api/python/genindex.html                  |  11 ++++
 docs/api/python/objects.inv                    | Bin 1390 -> 1401 bytes
 docs/api/python/searchindex.js                 |   2 +-
 src/main/python/systemds/operator/algorithm.py |  48 ++++++++++++--
 src/main/python/tests/algorithms/test_pca.py   |  88 +++++++++++++++++++++++++
 6 files changed, 176 insertions(+), 8 deletions(-)

diff --git a/docs/api/python/api/operator/algorithms.html b/docs/api/python/api/operator/algorithms.html
index c616665..792d78d 100644
--- a/docs/api/python/api/operator/algorithms.html
+++ b/docs/api/python/api/operator/algorithms.html
@@ -224,6 +224,25 @@
 </pre></div>
 </div>
 <span class="target" id="module-systemds.operator.algorithm"></span><dl class="py function">
+<dt id="systemds.operator.algorithm.kmeans">
+<code class="sig-prename descclassname">systemds.operator.algorithm.</code><code class="sig-name descname">kmeans</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">x</span><span class="p">:</span> <span class="n"><a class="reference internal" href="../script_building/dag.html#systemds.script_building.dag.DAGNode" title="systemds.script_building.dag.DAGNode">systemds.script_building.dag.DAGNode</a></span></em>, <em class="sig-param"><span class="o">**</span><span [...]
+<dd><p>Performs KMeans on matrix input.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>x</strong> – Input dataset to perform K-Means on.</p></li>
+<li><p><strong>k</strong> – The number of centroids to use for the algorithm.</p></li>
+<li><p><strong>runs</strong> – The number of concurrent instances of K-Means to run (with different initial centroids).</p></li>
+<li><p><strong>max_iter</strong> – The maximum number of iterations to run the K-Means algorithm for.</p></li>
+<li><p><strong>eps</strong> – Tolerance for the algorithm to declare convergence using WCSS change ratio.</p></li>
+<li><p><strong>is_verbose</strong> – Boolean flag if the algorithm should be run in a verbose manner.</p></li>
+<li><p><strong>avg_sample_size_per_centroid</strong> – The average number of records per centroid in the data samples.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py function">
 <dt id="systemds.operator.algorithm.l2svm">
 <code class="sig-prename descclassname">systemds.operator.algorithm.</code><code class="sig-name descname">l2svm</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">x</span><span class="p">:</span> <span class="n"><a class="reference internal" href="../script_building/dag.html#systemds.script_building.dag.DAGNode" title="systemds.script_building.dag.DAGNode">systemds.script_building.dag.DAGNode</a></span></em>, <em class="sig-param"><span class="n">y</span><span c [...]
 <dd><p>Perform L2SVM on matrix with labels given.</p>
@@ -259,6 +278,22 @@
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt id="systemds.operator.algorithm.pca">
+<code class="sig-prename descclassname">systemds.operator.algorithm.</code><code class="sig-name descname">pca</code><span class="sig-paren">(</span><em class="sig-param"><span class="n">x</span><span class="p">:</span> <span class="n"><a class="reference internal" href="../script_building/dag.html#systemds.script_building.dag.DAGNode" title="systemds.script_building.dag.DAGNode">systemds.script_building.dag.DAGNode</a></span></em>, <em class="sig-param"><span class="o">**</span><span cl [...]
+<dd><p>Performs PCA on the matrix input.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>x</strong> – Input dataset to perform PCA on.</p></li>
+<li><p><strong>K</strong> – The number of reduced dimensions.</p></li>
+<li><p><strong>center</strong> – Boolean specifying if the input values should be centered.</p></li>
+<li><p><strong>scale</strong> – Boolean specifying if the input values should be scaled.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
 </div>
 
 
diff --git a/docs/api/python/genindex.html b/docs/api/python/genindex.html
index 0abd3a3..04f579a 100644
--- a/docs/api/python/genindex.html
+++ b/docs/api/python/genindex.html
@@ -190,6 +190,7 @@
  | <a href="#E"><strong>E</strong></a>
  | <a href="#F"><strong>F</strong></a>
  | <a href="#G"><strong>G</strong></a>
+ | <a href="#K"><strong>K</strong></a>
  | <a href="#L"><strong>L</strong></a>
  | <a href="#M"><strong>M</strong></a>
  | <a href="#N"><strong>N</strong></a>
@@ -353,6 +354,14 @@
   </ul></td>
 </tr></table>
 
+<h2 id="K">K</h2>
+<table style="width: 100%" class="indextable genindextable"><tr>
+  <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="api/operator/algorithms.html#systemds.operator.algorithm.kmeans">kmeans() (in module systemds.operator.algorithm)</a>
+</li>
+  </ul></td>
+</tr></table>
+
 <h2 id="L">L</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
@@ -447,6 +456,8 @@
       </ul></li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="api/operator/algorithms.html#systemds.operator.algorithm.pca">pca() (in module systemds.operator.algorithm)</a>
+</li>
       <li><a href="api/onnx_systemds/onnx_helper.html#systemds.onnx_systemds.onnx_helper.PreparedValue">PreparedValue (class in systemds.onnx_systemds.onnx_helper)</a>
 </li>
   </ul></td>
diff --git a/docs/api/python/objects.inv b/docs/api/python/objects.inv
index 98244a3..244bb8f 100644
Binary files a/docs/api/python/objects.inv and b/docs/api/python/objects.inv differ
diff --git a/docs/api/python/searchindex.js b/docs/api/python/searchindex.js
index a5a1046..e26f29b 100644
--- a/docs/api/python/searchindex.js
+++ b/docs/api/python/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["api/context/systemds_context","api/matrix/data_gen","api/matrix/federated","api/matrix/matrix","api/onnx_systemds/convert","api/onnx_systemds/onnx_helper","api/onnx_systemds/operator_gen","api/onnx_systemds/render","api/onnx_systemds/util","api/operator/algorithms","api/operator/operation_node","api/script_building/dag","api/script_building/script","api/utils/converters","api/utils/helpers","getting_started/install","getting_started/simple_examples","guide/alg [...]
\ No newline at end of file
+Search.setIndex({docnames:["api/context/systemds_context","api/matrix/data_gen","api/matrix/federated","api/matrix/matrix","api/onnx_systemds/convert","api/onnx_systemds/onnx_helper","api/onnx_systemds/operator_gen","api/onnx_systemds/render","api/onnx_systemds/util","api/operator/algorithms","api/operator/operation_node","api/script_building/dag","api/script_building/script","api/utils/converters","api/utils/helpers","getting_started/install","getting_started/simple_examples","guide/alg [...]
\ No newline at end of file
diff --git a/src/main/python/systemds/operator/algorithm.py b/src/main/python/systemds/operator/algorithm.py
index be6c018..77c59a5 100644
--- a/src/main/python/systemds/operator/algorithm.py
+++ b/src/main/python/systemds/operator/algorithm.py
@@ -25,7 +25,7 @@ from systemds.operator import OperationNode
 from systemds.script_building.dag import DAGNode
 from systemds.utils.consts import VALID_INPUT_TYPES
 
-__all__ = ['l2svm', 'lm']
+__all__ = ['l2svm', 'lm', 'kmeans', 'pca']
 
 
 def l2svm(x: DAGNode, y: DAGNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
@@ -68,15 +68,15 @@ def lm(x: DAGNode, y: DAGNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Operat
 
 def kmeans(x: DAGNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
     """
-    Perfoms KMeans on matrix input.
+    Performs KMeans on matrix input.
 
     :param x: Input dataset to perform K-Means on.
-    :param k: The Number of centroids to use for the algorithm.
-    :param runs: The Number of concurrent instances of K-Means to run (with different initial centroids).
-    :param max_iter: The Maximum number of iterations to run the K-Means algorithm for.
+    :param k: The number of centroids to use for the algorithm.
+    :param runs: The number of concurrent instances of K-Means to run (with different initial centroids).
+    :param max_iter: The maximum number of iterations to run the K-Means algorithm for.
     :param eps: Tolerance for the algorithm to declare convergence using WCSS change ratio.
     :param is_verbose: Boolean flag if the algorithm should be run in a verbose manner.
-    :param avg_sample_size_per_centroid: The Average Number of records per centroid in the data samples.
+    :param avg_sample_size_per_centroid: The average number of records per centroid in the data samples.
     """
 
     x._check_matrix_op()
@@ -87,8 +87,42 @@ def kmeans(x: DAGNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
     if 'k' in kwargs.keys() and kwargs.get('k') < 1:
         raise ValueError("Invalid number of clusters in K means, number must be integer above 0")
 
+    params_dict = {'X': x}
+    params_dict.update(kwargs)
+    return OperationNode(x.sds_context, 'kmeans', named_input_nodes=params_dict)
+
+
+def pca(x: DAGNode, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode:
+    """
+    Performs PCA on the matrix input.
 
+    :param x: Input dataset to perform PCA on.
+    :param K: The number of reduced dimensions.
+    :param center: Boolean specifying if the input values should be centered.
+    :param scale: Boolean specifying if the input values should be scaled.
+    :return: An `OperationNode` for the 'pca' operation over the given inputs.
+    :raises ValueError: If the input array is empty or `K` is below 1.
+    """
+
+    # NOTE(review): relies on x exposing `_np_array`; confirm for DAGNode inputs not built from numpy data.
+    x._check_matrix_op()
+    if x._np_array.size == 0:
+        raise ValueError("Found array with 0 feature(s) (shape={s}) while a minimum of 1 is required."
+                         .format(s=x._np_array.shape))
+
+    if 'K' in kwargs and kwargs['K'] < 1:
+        raise ValueError("Invalid number of reduced dimensions in PCA, number must be integer above 0")
+
+    # Translate Python booleans into the strings "TRUE"/"FALSE" before they
+    # are passed on as named inputs. Any other value (for example a string
+    # that is already "TRUE" or "FALSE") is forwarded unchanged.
+    # Only 'scale' and 'center' are translated; 'K' is passed through as is.
+    for flag in ('scale', 'center'):
+        value = kwargs.get(flag)
+        if isinstance(value, bool):
+            kwargs[flag] = "TRUE" if value else "FALSE"
 
     params_dict = {'X': x}
     params_dict.update(kwargs)
-    return OperationNode(x.sds_context, 'kmeans', named_input_nodes=params_dict)
+    return OperationNode(x.sds_context, 'pca', named_input_nodes=params_dict)
+
diff --git a/src/main/python/tests/algorithms/test_pca.py b/src/main/python/tests/algorithms/test_pca.py
new file mode 100644
index 0000000..bab18af
--- /dev/null
+++ b/src/main/python/tests/algorithms/test_pca.py
@@ -0,0 +1,88 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import unittest
+
+import numpy as np
+from systemds.context import SystemDSContext
+from systemds.matrix import Matrix
+from systemds.operator.algorithm import pca
+
+
+class TestPCA(unittest.TestCase):
+    """Tests for the `pca` function of the SystemDS Python API."""
+    sds: SystemDSContext = None
+
+    @classmethod
+    def setUpClass(cls):
+        cls.sds = SystemDSContext()
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.sds.close()
+
+    def test_500x2(self):
+        """
+        Builds 30 points on a line through the origin in 2d space
+        (despite the test name), which PCA can map exactly onto 1d.
+        The check is that the sign of each projected value matches
+        the sign of the first coordinate of the corresponding input.
+        """
+        m1 = self.generate_matrices_for_pca(30, seed=1304)
+        X = Matrix(self.sds, m1)
+        # K=1 keeps a single component; scale/center are given as "FALSE" strings.
+        res = pca(X, K=1, scale="FALSE", center="FALSE").compute(verbose=True)
+        for (x, y) in zip(m1, res):
+            self.assertTrue((x[0] > 0 and y > 0) or (x[0] < 0 and y < 0))
+
+    def test_simple(self):
+        """
+        Points on the line y = x: the i-th projected value must be (within 0.001) i times the first one.
+        """
+        m1 = np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
+        res = pca(Matrix(self.sds, m1), K=1,
+                  scale=False, center=False).compute()
+        for x in range(len(m1) - 1):
+            self.assertTrue(abs(res[x + 1] - res[0] * (x + 2)) < 0.001)
+
+    def test_invalid_input_1(self):
+        features = Matrix(self.sds, np.array([]))
+        with self.assertRaises(ValueError) as context:
+            pca(features)
+
+    def test_invalid_input_2(self):
+        features = Matrix(self.sds, np.array([1]))
+        with self.assertRaises(ValueError) as context:
+            pca(features, K=-1)
+
+    def generate_matrices_for_pca(self, dims: int, seed: int = 1234):
+        np.random.seed(seed)
+        # Draw `dims` samples from a normal distribution (mean 0, std 0.1).
+        mu, sigma = 0, 0.1
+        s = np.random.normal(mu, sigma,  dims)
+        # Second column is 0.3 times the first, so all rows lie on one line.
+        m1 = np.array(np.c_[np.copy(s) * 1, np.copy(s)*0.3], dtype=np.double)
+
+        return m1
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)