You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/26 01:58:47 UTC
incubator-madlib git commit: PCA: Minor bug and doc fixes
Repository: incubator-madlib
Updated Branches:
refs/heads/master 9132145f6 -> dfeffb654
PCA: Minor bug and doc fixes
JIRA: MADLIB-948
Minor fixes:
-Added online help for pca_train and pca_sparse_train
-Unified error messages for clarity
-Fixed bug with the variance boundary case (1.0)
-Fixed docs to reflect correct mean table/column name
-Fixed docs to reflect the allowed ranges for components_param
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/dfeffb65
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/dfeffb65
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/dfeffb65
Branch: refs/heads/master
Commit: dfeffb6548e5816c8705131f092a50d304b671d3
Parents: 9132145
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Tue Feb 23 16:26:25 2016 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Thu Feb 25 16:56:36 2016 -0800
----------------------------------------------------------------------
src/ports/postgres/modules/pca/pca.py_in | 71 ++++++++++++++++++++++----
src/ports/postgres/modules/pca/pca.sql_in | 8 +--
2 files changed, 65 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/dfeffb65/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 2ab4514..327dfd7 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -10,6 +10,9 @@ from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
from linalg.matrix_ops import validate_dense
from linalg.matrix_ops import validate_sparse
+from linalg.svd import create_summary_table
+from linalg.svd import _svd_lower_wrap
+from linalg.svd import _svd_upper_wrap
from utilities.utilities import _array_to_string
from utilities.utilities import add_postfix
from utilities.utilities import __mad_version
@@ -17,9 +20,7 @@ from utilities.utilities import unique_string
from utilities.utilities import _assert
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_exists
-from linalg.svd import create_summary_table
-from linalg.svd import _svd_upper_wrap
-from linalg.svd import _svd_lower_wrap
+
import time
import plpy
@@ -349,14 +350,16 @@ def _validate_args(schema_madlib,
"PCA error: Source data table {0} does not exist!".
format(str(source_table)))
if not k and not variance:
- plpy.error("PCA error: components_param must be valid!")
+ plpy.error("""PCA error: components_param must be either
+ a positive integer or a float in the range (0.0,1.0]!""")
if k:
if k <= 0:
- plpy.error("PCA error: k must be a positive integer!")
+ plpy.error("""PCA error: components_param must be either
+ a positive integer or a float in the range (0.0,1.0]!""")
if variance:
- if (variance <= 0) or (variance >=1):
- plpy.error(
- "PCA error: proportion of variance has to be between 0 and 1")
+ if (variance <= 0) or (variance >1):
+ plpy.error("""PCA error: components_param must be either
+ a positive integer or a float in the range (0.0,1.0]!""")
# confirm output tables are valid
if pc_table:
_assert(not table_exists(pc_table, only_first_schema=True) and
@@ -654,7 +657,32 @@ def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
relative_recon_error -- FLOAT Relative error in the approximation
""".format(schema_madlib=schema_madlib)
else:
- return """
+ if message.lower() in ("example", "examples"):
+ return """
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+ row_id integer,
+ col_id integer,
+ val_id integer
+);
+COPY sparse_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
+1|2|4
+1|5|6
+3|8|4
+5|4|2
+6|6|12
+8|1|2
+8|7|2
+9|3|4
+9|8|2
+\.
+DROP TABLE IF EXISTS result_table;
+DROP TABLE IF EXISTS result_table_mean;
+SELECT pca_sparse_train('sparse_mat', 'result_table',
+'row_id', 'col_id', 'val_id', 10, 10, 10);
+ """
+ else:
+ return """
Principal component analysis (PCA) is a mathematical procedure that uses an
orthogonal transformation to convert a set of observations of possibly
correlated variables into a set of values of linearly uncorrelated variables
@@ -730,7 +758,28 @@ def pca_help_message(schema_madlib, message=None, **kwargs):
relative_recon_error -- FLOAT Relative error in the approximation
""".format(schema_madlib=schema_madlib)
else:
- return """
+ if message.lower() in ("example", "examples"):
+ return """
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+ row_id integer,
+ row_vec double precision[]
+);
+COPY mat (row_id, row_vec) FROM stdin DELIMITER '|';
+1|{1,2,3}
+2|{2,1,2}
+3|{3,2,1}
+\.
+DROP TABLE IF EXISTS result_table;
+DROP TABLE IF EXISTS result_table_mean;
+SELECT pca_train( 'mat',
+ 'result_table',
+ 'row_id',
+ 3
+ );
+ """
+ else:
+ return """
Principal component analysis (PCA) is a mathematical procedure that uses an
orthogonal transformation to convert a set of observations of possibly
correlated variables into a set of values of linearly uncorrelated variables
@@ -742,4 +791,4 @@ constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
components.
For an overview on usage, run: SELECT {schema_madlib}.pca_train('usage');
- """.format(schema_madlib=schema_madlib)
+ """.format(schema_madlib=schema_madlib)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/dfeffb65/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index acdac76..cbf921a 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -144,11 +144,11 @@ The table has the following columns:
</tr>
</table>
-The table <em>out_table</em>_means contains the column means.
+The table <em>out_table</em>_mean contains the column means.
This table has just one column:
<table class="output">
<tr>
-<th>column_means</th>
+<th>column_mean</th>
<td>A vector containing the column means for the input matrix.</td>
</tr>
</table>
@@ -181,7 +181,9 @@ components to calculate from the input data. If components_param is INTEGER,
it is used for denoting the number of principal components (<em>k</em>) to
compute. If components_param is FLOAT, the algorithm would return enough
principal vectors so that the ratio of the sum of the eigenvalues collected
-thus far to the sum of all eigenvalues is greater than this parameter.</DD>
+thus far to the sum of all eigenvalues is greater than this parameter.
+This value has to be either a positive INTEGER or a FLOAT in the range
+(0.0,1.0].</DD>
<DT>grouping_cols (optional)</DT>
<DD>TEXT, default: NULL. Currently <em>grouping_cols</em> is present as a placeholder for forward