You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/26 01:58:47 UTC

incubator-madlib git commit: PCA: Minor bug and doc fixes

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 9132145f6 -> dfeffb654


PCA: Minor bug and doc fixes

JIRA: MADLIB-948
Minor fixes:
-Added online help for pca_train and pca_sparse_train
-Unified error messages for clarity
-Fixed bug with the variance boundary case (1.0 is now accepted)
-Fixed docs to reflect correct mean table/column name
-Fixed docs to reflect the allowed ranges for components_param


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/dfeffb65
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/dfeffb65
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/dfeffb65

Branch: refs/heads/master
Commit: dfeffb6548e5816c8705131f092a50d304b671d3
Parents: 9132145
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Tue Feb 23 16:26:25 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Thu Feb 25 16:56:36 2016 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/pca/pca.py_in  | 71 ++++++++++++++++++++++----
 src/ports/postgres/modules/pca/pca.sql_in |  8 +--
 2 files changed, 65 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/dfeffb65/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 2ab4514..327dfd7 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -10,6 +10,9 @@ from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
 from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
 from linalg.matrix_ops import validate_dense
 from linalg.matrix_ops import validate_sparse
+from linalg.svd import create_summary_table
+from linalg.svd import _svd_lower_wrap
+from linalg.svd import _svd_upper_wrap
 from utilities.utilities import _array_to_string
 from utilities.utilities import add_postfix
 from utilities.utilities import __mad_version
@@ -17,9 +20,7 @@ from utilities.utilities import unique_string
 from utilities.utilities import _assert
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
-from linalg.svd import create_summary_table
-from linalg.svd import _svd_upper_wrap
-from linalg.svd import _svd_lower_wrap
+
 
 import time
 import plpy
@@ -349,14 +350,16 @@ def _validate_args(schema_madlib,
             "PCA error: Source data table {0} does not exist!".
             format(str(source_table)))
     if not k and not variance:
-        plpy.error("PCA error: components_param must be valid!")
+        plpy.error("""PCA error: components_param must be either
+            a positive integer or a float in the range (0.0,1.0]!""")
     if k:
         if k <= 0:
-            plpy.error("PCA error: k must be a positive integer!")
+            plpy.error("""PCA error: components_param must be either
+                a positive integer or a float in the range (0.0,1.0]!""")
     if variance:
-        if (variance <= 0) or (variance >=1):
-            plpy.error(
-                "PCA error: proportion of variance has to be between 0 and 1")
+        if (variance <= 0) or (variance >1):
+            plpy.error("""PCA error: components_param must be either
+                a positive integer or a float in the range (0.0,1.0]!""")
     # confirm output tables are valid
     if pc_table:
         _assert(not table_exists(pc_table, only_first_schema=True) and
@@ -654,7 +657,32 @@ def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
             relative_recon_error   -- FLOAT     Relative error in the approximation
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
+        if message.lower() in ("example", "examples"):
+            return """
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+    row_id integer,
+    col_id integer,
+    val_id integer
+);
+COPY sparse_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
+1|2|4
+1|5|6
+3|8|4
+5|4|2
+6|6|12
+8|1|2
+8|7|2
+9|3|4
+9|8|2
+\.
+DROP TABLE IF EXISTS result_table;
+DROP TABLE IF EXISTS result_table_mean;
+SELECT pca_sparse_train('sparse_mat', 'result_table',
+'row_id', 'col_id', 'val_id', 10, 10, 10);
+            """
+        else:
+            return """
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -730,7 +758,28 @@ def pca_help_message(schema_madlib, message=None, **kwargs):
             relative_recon_error   -- FLOAT     Relative error in the approximation
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
+        if message.lower() in ("example", "examples"):
+            return """
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+    row_id integer,
+    row_vec double precision[]
+);
+COPY mat (row_id, row_vec) FROM stdin DELIMITER '|';
+1|{1,2,3}
+2|{2,1,2}
+3|{3,2,1}
+\.
+DROP TABLE IF EXISTS result_table;
+DROP TABLE IF EXISTS result_table_mean;
+SELECT pca_train( 'mat',
+                  'result_table',
+                  'row_id',
+                  3
+    );
+            """
+        else:
+            return """
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -742,4 +791,4 @@ constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
 components.
 
 For an overview on usage, run: SELECT {schema_madlib}.pca_train('usage');
-        """.format(schema_madlib=schema_madlib)
+            """.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/dfeffb65/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index acdac76..cbf921a 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -144,11 +144,11 @@ The table has the following columns:
 </tr>
 </table>
 
-The table <em>out_table</em>_means contains the column means.
+The table <em>out_table</em>_mean contains the column means.
 This table has just one column:
 <table class="output">
 <tr>
-<th>column_means</th>
+<th>column_mean</th>
 <td>A vector containing the column means for the input matrix.</td>
 </tr>
 </table>
@@ -181,7 +181,9 @@ components to calculate from the input data. If components_param is INTEGER,
 it is used for denoting the number of principal components (<em>k</em>) to 
 compute. If components_param is FLOAT, the algorithm would return enough 
 principal vectors so that the ratio of the sum of the eigenvalues collected 
-thus far to the sum of all eigenvalues is greater than this parameter.</DD>
+thus far to the sum of all eigenvalues is greater than this parameter.
+This value has to be either a positive INTEGER or a FLOAT in the range 
+(0.0,1.0]</DD>
 
 <DT>grouping_cols (optional)</DT>
 <DD>TEXT, default: NULL.  Currently <em>grouping_cols</em> is present as a placeholder for forward