You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@madlib.apache.org by nj...@apache.org on 2017/01/19 19:53:28 UTC

[1/6] incubator-madlib git commit: Improve documentation and examples

Repository: incubator-madlib
Updated Branches:
  refs/heads/master e0439ed8d -> c729602eb


Improve documentation and examples

Examples are updated to make sure they have valid Eigen vectors,
along with other minor documentation changes.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/c729602e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/c729602e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/c729602e

Branch: refs/heads/master
Commit: c729602eb60a8bab2501ebff6f128c83ffda9880
Parents: 7e57745
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Wed Jan 18 14:03:52 2017 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 18 14:05:09 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/pca/pca.py_in        | 107 +++--
 src/ports/postgres/modules/pca/pca.sql_in       | 330 +++++++-------
 .../postgres/modules/pca/pca_project.sql_in     | 456 ++++++++++---------
 3 files changed, 441 insertions(+), 452 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c729602e/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 5b660a8..196c558 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -783,57 +783,53 @@ The result summary table ("rslt_summary_table" above) has the following columns
 ----------------------------------------------------------------
                         Examples
 ----------------------------------------------------------------
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
     row_id integer,
     col_id integer,
-    val_id integer
+    value double precision
 );
-COPY sparse_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
-1|2|4
-1|5|6
-3|8|4
-5|4|2
-6|6|12
-8|1|2
-8|7|2
-9|3|4
-9|8|2
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
 \.
 
 DROP TABLE IF EXISTS result_table_sparse;
 DROP TABLE IF EXISTS result_table_sparse_mean;
-SELECT {schema_madlib}.pca_sparse_train('sparse_mat', 'result_table_sparse',
-'row_id', 'col_id', 'val_id', 10, 10, 10);
+SELECT {schema_madlib}.pca_sparse_train('mat_sparse', 'result_table_sparse',
+'row_id', 'col_id', 'value', 4, 5, 3);
 
-DROP TABLE IF EXISTS sparse_mat_with_grouping;
-CREATE TABLE sparse_mat_with_grouping (
+SELECT * FROM result_table_sparse ORDER BY row_id;
+
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
     row_id integer,
     col_id integer,
-    val_id integer,
-    matrix_id integer
-);
-COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
-1|2|4|1
-1|5|6|1
-3|8|4|1
-5|4|2|1
-6|6|12|1
-8|1|2|1
-8|7|2|1
-9|3|4|1
-9|8|2|1
-1|2|4|2
-1|5|6|2
-6|6|12|2
-8|1|2|2
-8|7|2|2
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
 \.
 
 DROP TABLE IF EXISTS result_table_sparsed_grouped;
 DROP TABLE IF EXISTS result_table_sparsed_grouped_mean;
-SELECT {schema_madlib}.pca_sparse_train('sparse_mat_with_grouping', 'result_table_sparsed_grouped',
-'row_id', 'col_id', 'val_id', 10, 10, 10, 'matrix_id');
+SELECT {schema_madlib}.pca_sparse_train('mat_sparse_group', 'result_table_sparsed_grouped',
+'row_id', 'col_id', 'value', 4, 5, 0.8, 'matrix_id');
+
+SELECT * FROM result_table_sparsed_grouped ORDER BY matrix_id, row_id;
             """.format(schema_madlib=schema_madlib)
         else:
             return """
@@ -935,10 +931,10 @@ The result summary table ("rslt_summary_table" above) has the following columns
 ----------------------------------------------------------------
 DROP TABLE IF EXISTS mat;
 CREATE TABLE mat (
-    row_id integer,
+    id integer,
     row_vec double precision[]
 );
-COPY mat (row_id, row_vec) FROM stdin DELIMITER '|';
+COPY mat (id, row_vec) FROM stdin DELIMITER '|';
 1|{{1,2,3}}
 2|{{2,1,2}}
 3|{{3,2,1}}
@@ -948,38 +944,37 @@ DROP TABLE IF EXISTS result_table;
 DROP TABLE IF EXISTS result_table_mean;
 SELECT {schema_madlib}.pca_train( 'mat',
           'result_table',
-          'row_id',
+          'id',
           3
     );
+    
+SELECT * FROM result_table ORDER BY row_id;
 
-DROP TABLE IF EXISTS mat_with_grouping;
-CREATE TABLE mat_with_grouping (
-    row_id integer,
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
     row_vec double precision[],
     matrix_id integer
 );
-COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
-1|{{1,2,3}}|1
-2|{{2,1,2}}|1
-3|{{3,2,1}}|1
-4|{{1,2,3,4,5}}|2
-5|{{2,1,2,4,5}}|2
-6|{{3,2,1,4,5}}|2
-7|{{1,2,4,4,5}}|2
-8|{{1,3,3,4,5}}|2
-9|{{2,2,2,4,5}}|2
-10|{{3,3,2,4,5}}|2
-11|{{2,2,3,4,5}}|2
+INSERT INTO mat_group VALUES
+(1, '{{1,2,3}}', 1),
+(2, '{{2,1,2}}', 1),
+(3, '{{3,2,1}}', 1),
+(4, '{{1,2,3,4,5}}', 2),
+(5, '{{2,5,2,4,1}}', 2),
+(6, '{{5,4,3,2,1}}', 2);
 \.
 
 DROP TABLE IF EXISTS result_table_grp;
 DROP TABLE IF EXISTS result_table_grp_mean;
-SELECT {schema_madlib}.pca_train( 'mat_with_grouping',
+SELECT {schema_madlib}.pca_train( 'mat_group',
           'result_table_grp',
-          'row_id',
+          'id',
           0.9,
           'matrix_id'
     );
+
+SELECT * FROM result_table_grp ORDER BY matrix_id, row_id;
             """.format(schema_madlib=schema_madlib)
         else:
             return """

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c729602e/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 9de5559..6bcce1c 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -208,12 +208,23 @@ variance feature was introduced.  A special case to be aware of:
 'components_param' = 1 (INTEGER) will return 1 principal
 component, but 'components_param' = 1.0 (FLOAT) will return all 
 principal components, i.e., proportion of variance of 100%.
+\n \n
+Also, please note that the number of principal components (<em>k</em>)
+is global, even in the case where grouping is used (see 'grouping_cols'
+below).  In the case of grouping, proportion of variance 
+might be a better choice; this could result in different numbers
+of principal components for different groups.
 
 <DT>grouping_cols (optional)</DT>
 <DD>TEXT, default: NULL. A comma-separated list of column names, with the
 source data grouped using the combination of all the columns. An independent
 PCA model will be computed for each combination of the grouping columns.</DD>
 
+@note Dense matrices can be different sizes for different groups if desired.  
+Sparse matrices cannot be different sizes for different groups,
+because the 'row_dim' and 'col_dim' parameters used for sparse matrices 
+are global across all groups.
+
 <DT>lanczos_iter (optional)</DT>
 <DD>INTEGER, default: minimum of {<em>k+40</em>, smallest matrix dimension}
 where <em>k</em> is the number of principal components specified in the 
@@ -275,7 +286,7 @@ This sumary table has the following columns:
 @anchor examples
 @examp
 
--# View online help for the PCA training function:
+-# View online help for the PCA training functions:
 <pre class="example">
 SELECT madlib.pca_train();
 or
@@ -285,10 +296,9 @@ SELECT madlib.pca_sparse_train();
 -# Create sample data in dense matrix form:
 <pre class="example">
 DROP TABLE IF EXISTS mat;
-CREATE TABLE mat (
-    			row_id integer,
-    			row_vec double precision[]
-);
+CREATE TABLE mat (id integer,
+                  row_vec double precision[]
+                  );
 INSERT INTO mat VALUES
 (1, '{1,2,3}'),
 (2, '{2,1,2}'),
@@ -298,202 +308,180 @@ INSERT INTO mat VALUES
 -# Run the PCA function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train( 'mat',
-                        'result_table',
-                        'row_id',
-                         3);
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         2);               -- Number of principal components
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                     principal_components                     |       std_dev        |      proportion      
---------+--------------------------------------------------------------+----------------------+----------------------
-      1 | {-0.707106781186547,-1.6306400674182e-16,0.707106781186547}  |     1.41421356237309 |    0.857142857142245
-      2 | {-1.66533453693773e-16,1,5.55111512312578e-17}               |    0.577350269189626 |    0.142857142857041
-      3 | {-0.707106781186548,1.11022302462516e-16,-0.707106781186547} | 1.59506745224211e-16 | 1.09038864737157e-32
+ row_id |                     principal_components                     |      std_dev      |    proportion     
+--------+--------------------------------------------------------------+-------------------+-------------------
+      1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} |  1.41421356237309 | 0.857142857142244
+      2 | {0,1,0}                                                      | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
 
 -# Run the PCA function for a specified proportion of variance and view the results:
 <pre class="example">
+
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train( 'mat',
-                         'result_table',
-                         'row_id',
-                          0.9);
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         0.9);             -- Proportion of variance
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
  row_id |                     principal_components                     |      std_dev      |    proportion     
 --------+--------------------------------------------------------------+-------------------+-------------------
-      1 | {-0.707106781186548,-3.46944695195361e-17,0.707106781186548} |   1.4142135623731 | 0.857142857142245
-      2 | {2.22044604925031e-16,-1,1.11022302462516e-16}               | 0.577350269189626 | 0.142857142857041
+      1 | {0.707106781186548,-2.77555756156289e-17,-0.707106781186548} |   1.4142135623731 | 0.857142857142245
+      2 | {-1.11022302462516e-16,-1,0}                                 | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
 
--# An example of using grouping_cols to learn different models for each group.
-Create sample data in dense matrix form:
+-# Now we use grouping in dense form to learn different models for different groups.
+First, we create sample data in dense matrix form with a grouping column.
+Note we actually have different matrix sizes for the different groups, which 
+is allowed for dense:
 <pre class="example">
-DROP TABLE IF EXISTS mat_with_grouping;
-CREATE TABLE mat_with_grouping (
-    row_id integer,
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
     row_vec double precision[],
     matrix_id integer
 );
-COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
-1|{1,2,3}|1
-2|{2,1,2}|1
-3|{3,2,1}|1
-4|{1,2,3,4,5}|2
-5|{2,1,2,4,5}|2
-6|{3,2,1,4,5}|2
-7|{1,2,4,4,5}|2
-8|{1,3,3,4,5}|2
-9|{2,2,2,4,5}|2
-10|{3,3,2,4,5}|2
-11|{2,2,3,4,5}|2
-\.
+INSERT INTO mat_group VALUES
+(1, '{1,2,3}', 1),
+(2, '{2,1,2}', 1),
+(3, '{3,2,1}', 1),
+(4, '{1,2,3,4,5}', 2),
+(5, '{2,5,2,4,1}', 2),
+(6, '{5,4,3,2,1}', 2);
+</pre>
 
--# Run the PCA function for a specified number of principal components and view the results:
+-# Run the PCA function with grouping for a specified proportion of variance and view the results:
 <pre class="example">
-DROP TABLE IF EXISTS result_table_grp, result_table_grp_mean;
-SELECT {schema_madlib}.pca_train( 'mat_with_grouping',
-          'result_table_grp',
-          'row_id',
-          0.9,
-          'matrix_id'
-    );
-SELECT * FROM result_table_grp;
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_train('mat_group',             -- Source table
+                        'result_table_group',    -- Output table
+                        'id',                    -- Row id of source table
+                         0.8,                    -- Proportion of variance
+                        'matrix_id');            -- Grouping column
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
 </pre>
 <pre class="result">
- row_id |                                         principal_components                                         |      std_dev      |    proportion     | matrix_id
---------+------------------------------------------------------------------------------------------------------+-------------------+-------------------+-----------
-      1 | {-0.707106781186547,1.7650811368064e-16,0.707106781186547}                                           |   1.4142135623731 | 0.857142857142245 |         1
-      2 | {1.38777878078145e-17,1,4.16333634234434e-17}                                                        | 0.577350269189625 | 0.142857142857041 |         1
-      1 | {0.6597973999259,-0.0411675274295868,-0.750315017666817,-8.77661658627016e-17,-1.98408997564847e-16} |  1.19430528584267 | 0.726149513493974 |         2
-      2 | {-0.608369791899382,0.114069335981134,0.608369791899382,0.0380149274618831,0.495293291381477}        |  1.03132841282174 | 0.541488586593004 |         2
+ row_id |                                      principal_components                                      |     std_dev     |    proportion     | matrix_id 
+--------+------------------------------------------------------------------------------------------------+-----------------+-------------------+-----------
+      1 | {0.707106781186548,0,-0.707106781186547}                                                       | 1.4142135623731 | 0.857142857142245 |         1
+      1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 |         2
+      2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181}  |  1.795531127192 | 0.235897465516047 |         2
+(3 rows)
 </pre>
 
--# Create sample data in sparse matrix form:
+-# Now let's look at sparse matrices.  Create sample data in sparse matrix form:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
-                        row_id integer,
-                        col_id integer,
-                        val_id integer
-                        );
-INSERT INTO sparse_mat VALUES
-(1, 2, 4.0),
-(1, 5, 6.0),
-(3, 8, 4.0),
-(5, 4, 2.0),
-(6, 6, 12.0),
-(8, 1, 2.0),
-(8, 7, 2.0),
-(9, 3, 4.0),
-(9, 8, 2.0);
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
+    row_id integer,
+    col_id integer,
+    value double precision
+);
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
 </pre>
-
--# This matrix is what this matrix looks like in dense form:
+As an aside, this is what the sparse matrix above looks like when 
+put in dense form:
 <pre class="example">
-DROP TABLE IF EXISTS dense_mat;
-SELECT madlib.matrix_densify(
-							'sparse_mat',
-							'row=row_id, col=col_id, val=val_id',
-							'dense_mat');
-SELECT * FROM dense_mat order by row_id;
+DROP TABLE IF EXISTS mat_dense;
+SELECT madlib.matrix_densify('mat_sparse', 
+                            'row=row_id, col=col_id, val=value', 
+                            'mat_dense');
+SELECT * FROM mat_dense ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |       val_id       
---------+--------------------
-      1 | {0,4,0,0,6,0,0,0}
-      2 | {0,0,0,0,0,0,0,0}
-      3 | {0,0,0,0,0,0,0,4}
-      4 | {0,0,0,0,0,0,0,0}
-      5 | {0,0,0,2,0,0,0,0}
-      6 | {0,0,0,0,0,12,0,0}
-      7 | {0,0,0,0,0,0,0,0}
-      8 | {2,0,0,0,0,0,2,0}
-      9 | {0,0,4,0,0,0,0,2}
+ row_id |    value    
+--------+-------------
+      1 | {1,0,0,0,5}
+      2 | {0,2,0,6,0}
+      3 | {0,7,3,0,0}
+      4 | {0,0,8,4,0}
+(4 rows)
 </pre>
 
 -# Run the PCA sparse function for a specified number of principal components and view the results:
-<pre class="example">
-DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_sparse_train(
-                                'sparse_mat',
-                                'result_table',
-                                'row_id',
-                                'col_id',
-                                'val_id',
-                                9,
-                                8,
-                                5);
-SELECT * FROM result_table;
+<pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse',       -- Source table
+                                'result_table',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                  -- Actual number of rows in the matrix
+                                5,                  -- Actual number of columns in the matrix
+                                3);                 -- Number of principal components                            
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 Result (with principal components truncated for readability):
 <pre class="result">
- row_id |           principal_components              |      std_dev      |     proportion     
---------+---------------------------------------------+-------------------+----------------------
-      1 | {0.0189854059340971,0.0593979357345431,…    |  4.03069474374092 |  0.604208682045711
-      2 | {0.0346801706473592,-0.536234300404824,…    |  2.42282285507368 |  0.218308410262949
-      3 | {0.166190350977087,-0.112693750915351,…     |  1.54680674776235 | 0.0889814051004931
-      4 | {-0.0699448377725649,0.00569475043252321,…  |  1.10233418049845 | 0.0451911810308358
-      5 | {0.645363366217337,0.0403370697192613,…     | 0.906957663197704 | 0.0305915282045503
+ row_id |         principal_components                 |     std_dev      |    proportion     
+--------+----------------------------------------------+------------------+-------------------
+      1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467
+      2 | {-0.0647272661608605,0.877639526308692,...   | 3.68408023747461 | 0.333748701544697
+      3 | {-0.0780380267884855,0.177956517174911,...   | 3.05606908060098 | 0.229661267837836
+(3 rows)
 </pre>
 
--# Create sample data in sparse matrix form, with a grouping column:
+-# Now we use grouping in sparse form to learn different models for different groups.
+First, we create sample data in sparse matrix form with a grouping column:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat_with_grouping;
-CREATE TABLE sparse_mat_with_grouping (
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
     row_id integer,
     col_id integer,
-    val_id integer,
-    matrix_id integer
-);
-COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
-1|2|4|1
-1|5|6|1
-3|8|4|1
-5|4|2|1
-6|6|12|1
-8|1|2|1
-8|7|2|1
-9|3|4|1
-9|8|2|1
-1|2|4|2
-1|5|6|2
-6|6|12|2
-8|1|2|2
-8|7|2|2
-\.
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
 </pre>
 
--#  Run the PCA sparse function for a specified number of principal components and view the results:
+-#  Run the PCA sparse function with grouping for a specified proportion of variance
+and view the results:
 <pre class="example">
-DROP TABLE IF EXISTS result_table_grouped, result_table_grouped_mean;
-SELECT {schema_madlib}.pca_sparse_train (
-        'sparse_mat_with_grouping',
-        'result_table_grouped',
-        'row_id',
-        'col_id',
-        'val_id',
-        10,
-        10,
-        0.9,
-        'matrix_id'
-    );
-
-SELECT * FROM result_table_grouped;
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse_group',   -- Source table
+                                'result_table_group', -- Output table
+                                'row_id',             -- Row id of source table
+                                'col_id',             -- Column id of source table
+                                'value',              -- Value of matrix at row_id, col_id
+                                4,                    -- Actual number of rows in the matrix
+                                5,                    -- Actual number of columns in the matrix
+                                0.8,                  -- Proportion of variance
+                                'matrix_id');
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
 </pre>
 Result (with principal components truncated for readability):
 <pre class="result">
- row_id |         principal_components                 |     std_dev      |     proportion     | matrix_id
---------+----------------------------------------------+------------------+--------------------+-----------
-      1 | {0.0169618842872617,0.0374636722968646,...   | 3.81515913912922 |  0.600912629871244 |         1
-      2 | {-0.0290315981382334,-0.0981953629958789,... |   1.925316518345 |  0.153034831478482 |         1
-      3 | {-0.0321284612209643,-0.396464491482199,...  | 1.42677134361186 | 0.0840416890026057 |         1
-      4 | {-4.61214190235381e-16,0.707106781186545,... | 1.33333333333333 | 0.0733944954128442 |         1
-      1 | {0.0178733514257156,0.0394965128556239,...   | 3.80635791249958 |  0.710213752845497 |         2
-      2 | {0.0417974843144143,0.144805266892335,...    | 1.89702728276027 |  0.176407476055726 |         2
-      3 | {-1.3150650111236,7.0774402964125e-17,...    | 1.69008042534288 |  0.140018227653292 |         2
+ row_id |           principal_components             |     std_dev      |    proportion     | matrix_id 
+--------+--------------------------------------------+------------------+-------------------+-----------
+      1 | {-0.17805696611353,0.0681313257646983,...  | 2.73659933165925 | 0.544652792875481 |         1
+      2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 |         1
+      1 | {0,-0.479486114660443,...                  | 4.40325305087975 | 0.520500333693473 |         2
+      2 | {0,0.689230898585949,...                   |  3.7435566458567 | 0.376220573442628 |         2
+(4 rows)
 </pre>
 
 @anchor notes
@@ -548,13 +536,13 @@ recovered as the entries of \f$ {\boldsymbol \Sigma}/(\sqrt{(N-1)} \f$, and the
 components are the rows of  \f$ {\boldsymbol V} \f$. The reasoning behind using N − 1 instead of N to calculate the covariance is <a href="https://en.wikipedia.org/wiki/Bessel%27s_correction">Bessel's correction</a>.
 
 
-It is important to note that the PCA implementation assumes that the user will
- use only the principal components that have non-zero eigenvalues.  The SVD
- calculation is done with the Lanczos method, with does not guarantee
- correctness for singular vectors with zero-valued eigenvalues.  Consequently,
-  principal components with zero-valued eigenvalues are not guaranteed to be correct.
- Generally, this will not be problem unless the user wants to use the
- principal components for the entire eigenspectrum.
+@note It is important to note that this PCA implementation assumes that the user will
+use only the principal components that have non-zero eigenvalues.  The SVD
+calculation is done with the Lanczos method, which does not guarantee
+correctness for singular vectors with zero-valued eigenvalues.  Consequently,
+principal components with zero-valued eigenvalues are not guaranteed to be correct.
+Generally, this will not be a problem unless the user wants to use the
+principal components for the entire eigenspectrum.
 
 
 @anchor literature
@@ -605,14 +593,14 @@ m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 CREATE OR REPLACE FUNCTION
 MADLIB_SCHEMA.pca_train(
-    source_table    TEXT,   -- Source table name (dense matrix)
-    pc_table        TEXT,   -- Output table name for the principal components
-    row_id          TEXT,   -- Column name for the ID for each row
-    k               INTEGER,-- Number of principal components to compute
-    grouping_cols   TEXT,   -- Comma-separated list of grouping columns
-    lanczos_iter    INTEGER,-- The number of Lanczos iterations for the SVD calculation
-    use_correlation BOOLEAN, -- If True correlation matrix is used for principal components
-    result_summary_table  TEXT    -- Table name to store summary of results (Default: NULL)
+    source_table    TEXT,       -- Source table name (dense matrix)
+    pc_table        TEXT,       -- Output table name for the principal components
+    row_id          TEXT,       -- Column name for the ID for each row
+    k               INTEGER,    -- Number of principal components to compute
+    grouping_cols   TEXT,       -- Comma-separated list of grouping columns
+    lanczos_iter    INTEGER,    -- The number of Lanczos iterations for the SVD calculation
+    use_correlation BOOLEAN,    -- If True correlation matrix is used for principal components
+    result_summary_table  TEXT  -- Table name to store summary of results (Default: NULL)
 )
 RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.pca_train($1, $2, $3, $4, $5, $6, $7, $8, NULL)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/c729602e/src/ports/postgres/modules/pca/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in
index 0a0a69c..1023569 100644
--- a/src/ports/postgres/modules/pca/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/pca_project.sql_in
@@ -180,297 +180,297 @@ The <em>result_summary_table</em> contains information about the performance tim
 SELECT madlib.pca_project();
 </pre>
 
--# Create the sample data:
+-# Create sample data in dense matrix form:
 <pre class="example">
 DROP TABLE IF EXISTS mat;
-CREATE TABLE mat (
-    row_id integer,
-    row_vec double precision[]
-);
+CREATE TABLE mat (id integer,
+                  row_vec double precision[]
+                  );
 INSERT INTO mat VALUES
-(1, ARRAY[4,7,5]),
-(2, ARRAY[1,2,5]),
-(3, ARRAY[7,4,4]),
-(4, ARRAY[9,2,4]),
-(5, ARRAY[8,5,7]),
-(6, ARRAY[0,5,5]);
+(1, '{1,2,3}'),
+(2, '{2,1,2}'),
+(3, '{3,2,1}');
 </pre>
--# Run the PCA function and keep only the top two principle components:
+
+-# Run the PCA function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_train ( 'mat',
-                        'result_table',
-                        'row_id',
-                        2
-                        );
-SELECT * FROM result_table;
+SELECT madlib.pca_train('mat',             -- Source table
+                        'result_table',    -- Output table
+                        'id',              -- Row id of source table
+                         2);               -- Number of principal components
+SELECT * FROM result_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                    principal_components                     |     std_dev      |    proportion     
---------+-------------------------------------------------------------+------------------+-------------------
-      1 | {-0.995725178022077,0.0921925100100751,0.00564897786728818} |  3.7757341932744 | 0.745095924995985
-      2 | {0.0900421470942019,0.955231794466691,0.281823758023718}    | 1.97776947254051 | 0.204437565497132
+ row_id |                     principal_components                     |      std_dev      |    proportion     
+--------+--------------------------------------------------------------+-------------------+-------------------
+      1 | {0.707106781186547,-6.93889390390723e-18,-0.707106781186548} |  1.41421356237309 | 0.857142857142244
+      2 | {0,1,0}                                                      | 0.577350269189626 | 0.142857142857041
+(2 rows)
 </pre>
--# Project the original data into a low-dimensional representation:
+
+-# Project the original data to a lower dimensional representation and view the result of the projection:
 <pre class="example">
 DROP TABLE IF EXISTS residual_table, result_summary_table, out_table;
 SELECT madlib.pca_project( 'mat',
-                            'result_table',
-                            'out_table',
-                            'row_id',
-                            'residual_table',
-                            'result_summary_table'
-                            );
-</pre>
--# View dense matrix with the projection onto the principal components:
-<pre class="example">
+                           'result_table',
+                           'out_table',
+                           'id',
+                           'residual_table',
+                           'result_summary_table'
+                           );
 SELECT * FROM out_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                row_vec                
---------+---------------------------------------
-      1 | {1.09098309337665,-2.63145496174091}
-      2 | {3.61719607739251,2.41483045187516}
-      3 | {-2.17841894858709,0.24593773840028}
-      4 | {-4.3542543246514,1.97631703314526}
-      5 | {-3.06500468299723,-1.64480747723177}
-      6 | {4.88949878544481,-0.360822784430716}
+ row_id |               row_vec                
+--------+--------------------------------------
+      1 | {-1.41421356237309,-0.33333333333}
+      2 | {2.77555756157677e-17,0.66666666667}
+      3 | {1.41421356237309,-0.33333333333}
+(3 rows)
 </pre>
--# Check the error in the projection:
+Check the error in the projection:
 <pre class="example">
 SELECT * FROM result_summary_table;
 </pre>
 <pre class="result">
-   exec_time   | residual_norm | relative_residual_norm 
----------------+---------------+------------------------
- 56.8881034851 | 2.19726255664 |         0.099262204234
+   exec_time   |   residual_norm   | relative_residual_norm 
+---------------+-------------------+------------------------
+ 331.792116165 | 5.89383520611e-16 |      9.68940539229e-17
+(1 row)
 </pre>
--# Check the residuals:
+Check the residuals:
 <pre class="example">
 SELECT * FROM residual_table ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                          row_vec                           
---------+------------------------------------------------------------
-      1 | {0.0160441468047001,0.219103418411008,-0.747769465736052}
-      2 | {-0.0141636064722857,-0.193422226365899,0.660123132354738}
-      3 | {0.0197048332985021,0.269094791232932,-0.918383061897929}
-      4 | {0.00897783310632772,0.122603834748268,-0.418429820364218}
-      5 | {-0.0333376637524658,-0.455268589780183,1.55376831915842}
-      6 | {0.00277445701511336,0.037888771752409,-0.129309103509956}
+ row_id |                              row_vec                               
+--------+--------------------------------------------------------------------
+      1 | {-2.22044604925031e-16,-1.11022302462516e-16,3.33066907387547e-16}
+      2 | {-1.12243865646685e-18,0,4.7381731349413e-17}
+      3 | {2.22044604925031e-16,1.11022302462516e-16,-3.33066907387547e-16}
+(3 rows)
 </pre>
--# Run the PCA project function on subsets of an input table based on grouping columns.
-Note that the 'pc_table' used for projection must also be generated using the same
-grouping columns. Create sample data:
+
+-# Now we use grouping in dense form to learn different models for different groups.
+First, we create sample data in dense matrix form with a grouping column.
+Note that the groups have different matrix sizes, which
+is allowed in dense format:
 <pre class="example">
-DROP TABLE IF EXISTS mat_with_grouping;
-CREATE TABLE mat_with_grouping (
-    row_id integer,
+DROP TABLE IF EXISTS mat_group;
+CREATE TABLE mat_group (
+    id integer,
     row_vec double precision[],
     matrix_id integer
 );
-COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
-1|{1,2,3}|1
-2|{2,1,2}|1
-3|{3,2,1}|1
-4|{1,2,3,4,5}|2
-5|{2,1,2,4,5}|2
-6|{3,2,1,4,5}|2
-7|{1,2,4,4,5}|2
-8|{1,3,3,4,5}|2
-9|{2,2,2,4,5}|2
-10|{3,3,2,4,5}|2
-11|{2,2,3,4,5}|2
-\.
+INSERT INTO mat_group VALUES
+(1, '{1,2,3}', 1),
+(2, '{2,1,2}', 1),
+(3, '{3,2,1}', 1),
+(4, '{1,2,3,4,5}', 2),
+(5, '{2,5,2,4,1}', 2),
+(6, '{5,4,3,2,1}', 2);
 </pre>
--# Run the PCA function using grouping_cols and capture 90% of the proportion of variance:
+
+-# Run the PCA function with grouping for a specified proportion of variance and view the results:
 <pre class="example">
-DROP TABLE IF EXISTS result_table_grp;
-DROP TABLE IF EXISTS result_table_grp_mean;
-SELECT madlib.pca_train( 'mat_with_grouping',
-          'result_table_grp',
-          'row_id',
-          0.9,
-          'matrix_id'
-    );
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_train('mat_group',             -- Source table
+                        'result_table_group',    -- Output table
+                        'id',                    -- Row id of source table
+                         0.8,                    -- Proportion of variance
+                        'matrix_id');            -- Grouping column
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
 </pre>
--# Project the original data to low-dimensional representation, based on independent PCA models
-learnt for each group:
-<pre class="example">
-DROP TABLE IF EXISTS mat_with_grouping_projected;
-SELECT madlib.pca_project(
-    'mat_with_grouping',
-    'result_table_grp',
-    'mat_with_grouping_projected',
-    'row_id'
-    );
+<pre class="result">
+ row_id |                                      principal_components                                      |     std_dev     |    proportion     | matrix_id 
+--------+------------------------------------------------------------------------------------------------+-----------------+-------------------+-----------
+      1 | {0.707106781186548,0,-0.707106781186547}                                                       | 1.4142135623731 | 0.857142857142245 |         1
+      1 | {-0.555378486712784,-0.388303582074091,0.0442457354870796,0.255566375612852,0.688115693174023} | 3.2315220311722 | 0.764102534485173 |         2
+      2 | {0.587384101786277,-0.485138064894743,0.311532046315153,-0.449458074050715,0.347212037159181}  |  1.795531127192 | 0.235897465516047 |         2
+(3 rows)
 </pre>
--# Check the error in the projection:
+
+-# Run the PCA projection on subsets of an input table based on grouping columns. 
+Note that the parameter 'pc_table' used for projection must be generated in training 
+using the same grouping columns. 
 <pre class="example">
-SELECT * FROM mat_with_grouping_projected;
+DROP TABLE IF EXISTS mat_group_projected;
+SELECT madlib.pca_project('mat_group',
+                          'result_table_group',
+                          'mat_group_projected',
+                          'id');
+SELECT * FROM mat_group_projected ORDER BY matrix_id, row_id;
 </pre>
 <pre class="result">
- row_id |                row_vec                 | matrix_id
---------+----------------------------------------+-----------
-      1 | {1.41421356237309,0.33333333333}       |         1
-      2 | {1.08709337828423e-16,-0.66666666667}  |         1
-      3 | {-1.41421356237309,0.33333333333}      |         1
-      4 | {-0.947334292839873,0.757198329919689} |         2
-      5 | {0.503945652182431,-0.357198389962114} |         2
-      6 | {1.87289054234556,-1.63346278982675}   |         2
-      7 | {-1.69764931050669,1.55408536983517}   |         2
-      8 | {-0.98850182026946,0.676264489928273}  |         2
-      9 | {0.462778124752844,-0.43813222995353}  |         2
-     10 | {1.08140799724916,-0.917509589902686}  |         2
-     11 | {-0.287536892913973,0.358754809961949} |         2
+ row_id |                row_vec                | matrix_id 
+--------+---------------------------------------+-----------
+      1 | {1.4142135623731}                     |         1
+      2 | {7.40148683087139e-17}                |         1
+      3 | {-1.4142135623731}                    |         1
+      4 | {-3.59290479201926,0.559694003674779} |         2
+      5 | {0.924092949098971,-2.00871628417505} |         2
+      6 | {2.66881184290186,1.44902228049511}   |         2
+(6 rows)
 </pre>
 
--# Create sample data in sparse matrix form:
+-# Now let's look at sparse matrices.  Create sample data in sparse matrix form:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
-                        row_id integer,
-                        col_id integer,
-                        val_id integer
-                        );
-INSERT INTO sparse_mat VALUES
-(1, 2, 4.0),
-(1, 5, 6.0),
-(3, 8, 4.0),
-(5, 4, 2.0),
-(6, 6, 12.0),
-(8, 1, 2.0),
-(8, 7, 2.0),
-(9, 3, 4.0),
-(9, 8, 2.0);
+DROP TABLE IF EXISTS mat_sparse;
+CREATE TABLE mat_sparse (
+    row_id integer,
+    col_id integer,
+    value double precision
+);
+INSERT INTO mat_sparse VALUES
+(1, 1, 1.0),
+(2, 2, 2.0),
+(3, 3, 3.0),
+(4, 4, 4.0),
+(1, 5, 5.0),
+(2, 4, 6.0),
+(3, 2, 7.0),
+(4, 3, 8.0);
 </pre>
--# Run the PCA sparse function for a specified number of principal components:
+As an aside, this is what the sparse matrix above looks like when 
+put in dense form:
 <pre class="example">
-DROP TABLE IF EXISTS result_table, result_table_mean;
-SELECT madlib.pca_sparse_train(
-                    'sparse_mat',
-                    'result_table',
-                    'row_id',
-                    'col_id',
-                    'val_id',
-                    9,
-                    8,
-                    5);
+DROP TABLE IF EXISTS mat_dense;
+SELECT madlib.matrix_densify('mat_sparse', 
+                            'row=row_id, col=col_id, val=value', 
+                            'mat_dense');
+SELECT * FROM mat_dense ORDER BY row_id;
 </pre>
+<pre class="result">
+ row_id |    value    
+--------+-------------
+      1 | {1,0,0,0,5}
+      2 | {0,2,0,6,0}
+      3 | {0,7,3,0,0}
+      4 | {0,0,8,4,0}
+(4 rows)
+</pre>
+
+-# Run the PCA sparse function for a specified number of principal components and view the results:
+<pre class="example">DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse',       -- Source table
+                                'result_table',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                  -- Actual number of rows in the matrix
+                                5,                  -- Actual number of columns in the matrix
+                                3);                 -- Number of principal components                            
+SELECT * FROM result_table ORDER BY row_id;
+</pre>
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |         principal_components                 |     std_dev      |    proportion     
+--------+----------------------------------------------+------------------+-------------------
+      1 | {-0.0876046030186158,-0.0968983772909994,... | 4.21362803829554 | 0.436590030617467
+      2 | {-0.0647272661608605,0.877639526308692,...   | 3.68408023747461 | 0.333748701544697
+      3 | {-0.0780380267884855,0.177956517174911,...   | 3.05606908060098 | 0.229661267837836
+(3 rows)
+</pre>
+
 -# Project the original sparse data to low-dimensional representation:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat_out;
+DROP TABLE IF EXISTS mat_sparse_out;
 SELECT madlib.pca_sparse_project(
-                    'sparse_mat',
+                    'mat_sparse',
                     'result_table',
-                    'sparse_mat_out',
+                    'mat_sparse_out',
                     'row_id',
                     'col_id',
-                    'val_id',
-                    9,
-                    8
+                    'value',
+                    4,
+                    5
                     );
-</pre>
--# Check the error in the projection:
-<pre class="example">
-SELECT * FROM sparse_mat_out ORDER BY row_id;
+SELECT * FROM mat_sparse_out ORDER BY row_id;
 </pre>
 <pre class="result">
- row_id |                                              row_vec
---------+----------------------------------------------------------------------------------------------------
-      1 | {1.34695183995672,0.879135043008323,1.87818383323404,-2.82842712474619,1.24074659599017}
-      2 | {1.63437198343112,-5.32859025290785,-0.240160697276225,-2.36847578586602e-15,0.295172631209684}
-      3 | {1.43909964102331,1.28442554799452,-2.99624380589971,-7.21876262469717e-15,1.06626861836748}
-      4 | {1.21839219844249,0.517445462630873,0.298488396687886,2.88426689939176e-15,-1.34560049142698}
-      5 | {-10.7325138823598,-0.136175420017185,-0.0338708165820332,8.58109879450845e-16,0.0577033753567249}
-      6 | {1.21839219844249,0.517445462630873,0.298488396687885,2.1626219333854e-15,-1.34560049142698}
-      7 | {1.21839219844249,0.517445462630873,0.298488396687885,1.91282175284474e-15,-1.34560049142698}
-      8 | {1.34695183995672,0.879135043008322,1.87818383323403,2.82842712474619,1.24074659599017}
-      9 | {1.30996198263469,0.86973365102307,-1.38155753676828,-2.88889282865906e-15,0.136163657369328}
+ row_id |                         row_vec                         
+--------+---------------------------------------------------------
+      1 | {4.66617015032369,-2.63552220635847,2.1865220849604}
+      2 | {0.228360685652383,-1.21616275892926,-4.46864627611561}
+      3 | {0.672067460100428,5.45249627172823,0.56445525585642}
+      4 | {-5.5665982960765,-1.6008113064405,1.71766893529879}
+(4 rows)
 </pre>
--# Create sample data in sparse matrix form, with a grouping column:
+
+-# Now we use grouping in sparse form to learn different models for different groups.
+First, we create sample data in sparse matrix form with a grouping column:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat_with_grouping;
-CREATE TABLE sparse_mat_with_grouping (
+DROP TABLE IF EXISTS mat_sparse_group;
+CREATE TABLE mat_sparse_group (
     row_id integer,
     col_id integer,
-    val_id integer,
-    matrix_id integer
-);
-COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
-1|2|4|1
-1|5|6|1
-3|8|4|1
-5|4|2|1
-6|6|12|1
-8|1|2|1
-8|7|2|1
-9|3|4|1
-9|8|2|1
-1|2|4|2
-1|5|6|2
-6|6|12|2
-8|1|2|2
-8|7|2|2
-\.
+    value double precision,
+    matrix_id integer);
+INSERT INTO mat_sparse_group VALUES
+(1, 1, 1.0, 1),
+(2, 2, 2.0, 1),
+(3, 3, 3.0, 1),
+(4, 4, 4.0, 1),
+(1, 5, 5.0, 1),
+(2, 4, 6.0, 2),
+(3, 2, 7.0, 2),
+(4, 3, 8.0, 2);
 </pre>
--#  Run the PCA sparse function using grouping_cols and capture 90% of the proportion of variance:
+
+-#  Run the PCA sparse function with grouping for a specified proportion of variance
+and view the results:
 <pre class="example">
-DROP TABLE IF EXISTS result_table_grouped, result_table_grouped_mean;
-SELECT madlib.pca_sparse_train (
-        'sparse_mat_with_grouping',
-        'result_table_grouped',
-        'row_id',
-        'col_id',
-        'val_id',
-        10,
-        10,
-        0.9,
-        'matrix_id'
-    );
+DROP TABLE IF EXISTS result_table_group, result_table_group_mean;
+SELECT madlib.pca_sparse_train( 'mat_sparse_group',       -- Source table
+                                'result_table_group',     -- Output table
+                                'row_id',           -- Row id of source table
+                                'col_id',           -- Column id of source table
+                                'value',            -- Value of matrix at row_id, col_id
+                                4,                 -- Actual number of rows in the matrix
+                                5,                 -- Actual number of columns in the matrix
+                                0.8,                 -- Proportion of variance
+                                'matrix_id');
+SELECT * FROM result_table_group ORDER BY matrix_id, row_id;
 </pre>
--# Project the original sparse data to low-dimensional representation, based on independent PCA models
-learnt for each group:
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |           principal_components             |     std_dev      |    proportion     | matrix_id 
+--------+--------------------------------------------+------------------+-------------------+-----------
+      1 | {-0.17805696611353,0.0681313257646983,...  | 2.73659933165925 | 0.544652792875481 |         1
+      2 | {-0.0492086814863993,0.149371585357526,... | 2.06058314533194 | 0.308800210823714 |         1
+      1 | {0,-0.479486114660443,...                  | 4.40325305087975 | 0.520500333693473 |         2
+      2 | {0,0.689230898585949,...                   |  3.7435566458567 | 0.376220573442628 |         2
+(4 rows)
+</pre>
+
+-# Projection in sparse format with grouping:
 <pre class="example">
-DROP TABLE IF EXISTS sparse_mat_with_grouping_projected;
+DROP TABLE IF EXISTS mat_sparse_group_projected;
 SELECT madlib.pca_sparse_project(
-    'sparse_mat_with_grouping',
-    'result_table_grouped',
-    'sparse_mat_with_grouping_projected',
+    'mat_sparse_group',
+    'result_table_group',
+    'mat_sparse_group_projected',
     'row_id',
     'col_id',
-    'val_id',
-    10,
-    10
+    'value',
+    4,
+    5
     );
-</pre>
--# Check the error in the projection:
-<pre class="example">
-SELECT * FROM sparse_mat_with_grouping_projected ORDER BY matrix_id,row_id;
+SELECT * FROM mat_sparse_group_projected ORDER BY matrix_id, row_id;
 </pre>
 <pre class="result">
- row_id |                                     row_vec                                     | matrix_id
---------+---------------------------------------------------------------------------------+-----------
-      1 | {1.22692546452761,0.818988440128327,1.81591522940788,2.82842712474619}          |         1
-      2 | {1.4852284082451,-5.38921614355968,-0.238413962586454,-2.76195922299053e-16}    |         1
-      3 | {1.30995313897398,1.19175698765945,-3.05860526896832,-2.63021567919958e-15}     |         1
-      4 | {1.11099454391467,0.484270184421277,0.294314185921038,-1.61822162787753e-16}    |         1
-      5 | {-10.8525988405155,-0.128517253667187,-0.0335370457586047,8.53109060561845e-16} |         1
-      6 | {1.11099454391467,0.484270184421277,0.294314185921038,2.9799754824927e-16}      |         1
-      7 | {1.11099454391467,0.484270184421277,0.294314185921038,-8.08358590285308e-16}    |         1
-      8 | {1.22692546452761,0.818988440128326,1.81591522940789,-2.82842712474619}         |         1
-      9 | {1.19351195715706,0.808981987902128,-1.41427400274461,-1.38815367040019e-15}    |         1
-     10 | {1.07707077534015,0.42620698814481,0.230057263479109,-1.46091661600792e-16}     |         1
-      1 | {-1.28753936783518,4.42689855953263}                                            |         2
-      2 | {-1.56033687316063,0.102219682546935}                                           |         2
-      3 | {10.8262428615875,-0.605455042777997}                                           |         2
-      4 | {-1.16530001926412,-0.290932942633583}                                          |         2
-      5 | {-1.16530001926412,-0.605455042777997}                                          |         2
-      6 | {-1.12955331641269,-0.605455042777998}                                          |         2
-      7 | {-1.12955331641269,-0.605455042777998}                                          |         2
-      8 | {-1.12955331641269,-0.605455042777998}                                          |         2
-      9 | {-1.12955331641269,-0.605455042777998}                                          |         2
-     10 | {-1.12955331641269,-0.605455042777998}                                          |         2
+ row_id |                 row_vec                 | matrix_id 
+--------+-----------------------------------------+-----------
+      1 | {-4.00039298524261,-0.626820612715982}  |         1
+      2 | {0.765350785238575,0.951348276645455}   |         1
+      3 | {1.04951017256904,2.22388180170356}     |         1
+      4 | {2.185532027435,-2.54840946563303}      |         1
+      1 | {-0.627846810195469,-0.685031603549092} |         2
+      2 | {-1.64754249747757,-4.7662114622896}    |         2
+      3 | {-3.98424961281857,4.13958468655255}    |         2
+      4 | {6.25963892049161,1.31165837928614}     |         2
+(8 rows)
 </pre>
 
 @anchor notes
@@ -482,12 +482,14 @@ containing the principal components. If this table is not found by the MADlib
 projection function, it will trigger an error.  As long as the principal component
 tables are created with MADlib functions, then the column-means table will be
 automatically found by the MADlib projection functions.
+
 - Because of the centering step in PCA projection
 (see "Technical Background"), sparse matrices almost always
 become dense during the projection
 process.  Thus, this implementation automatically densifies sparse matrix input,
 and there should be no expected performance improvement in using sparse matrix
 input over dense matrix input.
+
 - Table names can be optionally schema qualified (current_schemas() is
 searched if a schema name is not provided) and all table and column names
 should follow case-sensitivity and quoting rules per the database.
@@ -495,6 +497,10 @@ should follow case-sensitivity and quoting rules per the database.
 If mixed-case or multi-byte characters are desired for entity names then the
 string should be double-quoted; in this case the input would be '"MyTable"').
 
+- If the input table for pca_project (pca_sparse_project) contains grouping columns,
+the same grouping columns must also have been used in the training function that
+generated the principal components.
+
 
 @anchor background_project
 @par Technical Background

[2/6] incubator-madlib git commit: Drop a temp table explicitly.

Posted by nj...@apache.org.

Drop a temp table explicitly.

One of the source table copies was not dropped, so it stayed alive
after the function exited. It is now dropped explicitly.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/4e1e96a2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/4e1e96a2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/4e1e96a2

Branch: refs/heads/master
Commit: 4e1e96a25b818e6881297ee708683a7b952f92c2
Parents: edb69dd
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Tue Jan 10 14:09:58 2017 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 18 14:05:09 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/pca/pca.py_in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4e1e96a2/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 71146d5..b6609f2 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -199,7 +199,7 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
         col_dim = 0
     while True:
         if result_summary_table:
-            result_summary_table_temp = result_summary_table + unique_string() + "_" + str(grp_id)
+            result_summary_table_temp = "pg_temp." + result_summary_table + unique_string() + "_" + str(grp_id)
         if grouping_cols:
             grp_value_dict = distinct_grouping_values[grp_id]
             where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
@@ -309,7 +309,7 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
         # grouping, and/or, the input table had perfect values for the row_id column.
         # This temp table will ensure pca works even when the value of row_id column
         # in dense matrix format does not have values ranging from 1 to number of rows.
-        source_table_grouped = unique_string() + "group_" + str(grp_id)
+        source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
         plpy.execute("""
                     CREATE TABLE {source_table_grouped} AS
                     SELECT {temp_table_columns}
@@ -325,6 +325,7 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
         schema_madlib, source_table_grouped, source_table_copy, row_id)
 
     if(created_new_table):
+        plpy.execute("DROP TABLE {0}".format(source_table_grouped))
         source_table_grouped = source_table_copy
     [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',

[4/6] incubator-madlib git commit: PCA: Add grouping support to PCA

Posted by nj...@apache.org.

PCA: Add grouping support to PCA

JIRA: MADLIB-947

- PCA can now handle grouping columns. pca_train() with grouping_cols
parameter specified learns an independent model for each group in
the input table. New columns corresponding to the columns specified
in grouping_cols will be created in the output, mean and summary
tables.
- If pca_project() is called on an input table that has grouping_cols
in it, the pc_table used in the parameter list must be a PCA model
table that is learnt with grouping_cols. If the input table for
pca_project() has grouping columns but the pc_table used does not
support grouping_cols, or vice versa, there will be an error thrown.
- Another important change: previously, the 'row_id' column in the
input tables always had to be serially increasing, starting from 1. That
requirement is now relaxed, since this commit converts the given 'row_id'
to a new column that follows the rules laid out by the sparse and dense
matrix formats.
- Both the online and user docs are improved with more examples.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8bd46ae8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8bd46ae8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8bd46ae8

Branch: refs/heads/master
Commit: 8bd46ae80be9f0c630638ece9b5c8ddf7152a85c
Parents: e0439ed
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Wed Dec 21 14:18:38 2016 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 18 14:05:09 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/pca/pca.py_in        | 807 ++++++++++------
 src/ports/postgres/modules/pca/pca.sql_in       | 169 +++-
 .../postgres/modules/pca/pca_project.py_in      | 932 ++++++++++++++-----
 .../postgres/modules/pca/pca_project.sql_in     | 242 ++++-
 src/ports/postgres/modules/pca/test/pca.sql_in  | 110 ++-
 .../modules/pca/test/pca_project.sql_in         | 128 ++-
 6 files changed, 1824 insertions(+), 564 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 327dfd7..9a13b2f 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -16,12 +16,13 @@ from linalg.svd import _svd_upper_wrap
 from utilities.utilities import _array_to_string
 from utilities.utilities import add_postfix
 from utilities.utilities import __mad_version
-from utilities.utilities import unique_string
+from utilities.utilities import unique_string, split_quoted_delimited_str
 from utilities.utilities import _assert
+from utilities.validate_args import get_cols, get_cols_and_types
+from utilities.control import MinWarning
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 
-
 import time
 import plpy
 
@@ -29,16 +30,43 @@ version_wrapper = __mad_version()
 string_to_array = version_wrapper.select_vecfunc()
 array_to_string = version_wrapper.select_vec_return()
 
+
+def pca_sparse(schema_madlib, source_table, pc_table, row_id,
+               col_id, val_id, row_dim, col_dim, k, grouping_cols,
+               lanczos_iter, use_correlation, result_summary_table,
+               variance, **kwargs):
+    """
+    Args:
+        @param schema_madlib
+        @param source_table
+        @param pc_table
+        @param row_id
+        @param col_id
+        @param val_id
+        @param row_dim
+        @param col_dim
+        @param k
+        @param grouping_cols
+        @param lanczos_iter
+        @param use_correlation
+        @param result_summary_table
+        @param variance
+
+    Returns:
+        None
+
+    """
+    pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance, True, col_id,
+        val_id, row_dim, col_dim)
+# ------------------------------------------------------------------------
+
 # ========================================================================
 def pca(schema_madlib, source_table, pc_table, row_id,
         k, grouping_cols, lanczos_iter, use_correlation,
         result_summary_table, variance, **kwargs):
     """
-    Compute the PCA of the matrix in source_table.
-
-    This function is the specific call for dense matrices and creates three
-    tables corresponding to the three decomposition matrices.
-
     Args:
         @param schema_madlib
         @param source_table
@@ -55,7 +83,23 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         None
 
     """
-    startTime = time.time()  # measure the starting time
+    pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance)
+# ------------------------------------------------------------------------
+
+
+def pca_wrap(schema_madlib, source_table, pc_table, row_id,
+        k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, variance, is_sparse=False, col_id=None,
+        val_id=None, row_dim=None, col_dim=None, **kwargs):
+    """
+    This wrapper was added to support grouping columns. This
+    function does the necessary pre-processing for handling
+    grouping_cols, if set. It then constructs a single query
+    that includes a separate "madlib._pca_union(...)" for each
+    group.
+    """
     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
@@ -63,25 +107,232 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
+    grouping_cols_list = []
+    _validate_args_output_table(pc_table, result_summary_table)
+    if(grouping_cols):
+        # validate the grouping columns. We currently only support grouping_cols
+        # to be column names in the source_table, and not expressions!
+        grouping_cols_list = split_quoted_delimited_str(grouping_cols)
+        _assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
+                "PCA error: One or more grouping columns in {0} do not exist!".format(grouping_cols))
+        distinct_grouping_values = plpy.execute("""
+                SELECT DISTINCT {grouping_cols} FROM {source_table}
+            """.format(grouping_cols=grouping_cols, source_table=source_table))
+    else:
+        grouping_cols = ''
+    other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
+    grouping_cols_clause = ''
+    if grouping_cols_list:
+        cols_names_types = get_cols_and_types(source_table)
+        grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
+    ## Create all output tables
+    plpy.execute("""
+        CREATE TABLE {pc_table} (
+            row_id               INTEGER,
+            principal_components double precision[],
+            std_dev              double precision,
+            proportion           double precision
+            {grouping_cols_clause}
+        )
+        """.format(pc_table=pc_table, grouping_cols_clause=grouping_cols_clause))
+    pc_table_mean = add_postfix(pc_table, "_mean")
+    plpy.execute("""
+        DROP TABLE IF EXISTS {pc_table_mean};
+        CREATE TABLE {pc_table_mean} (
+            column_mean     double precision[]
+            {grouping_cols_clause}
+        )
+        """.format(pc_table_mean=pc_table_mean, grouping_cols_clause=grouping_cols_clause))
+    if result_summary_table:
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0};
+                CREATE TABLE {0} (
+                rows_used               INTEGER,
+                "exec_time (ms)"        numeric,
+                iter                    INTEGER,
+                recon_error             double precision,
+                relative_recon_error    double precision,
+                use_correlation         boolean
+                {1}
+                )
+            """.format(result_summary_table, grouping_cols_clause))
+    else:
+        result_summary_table = ''
+
+    # declare variables whose values will be different for each group, if
+    # grouping_cols is specified
+    grouping_where_clause = ''
+    select_grouping_cols = ''
+    temp_table_columns = ''
+    result_summary_table_temp = ''
+    other_columns_in_table.remove(row_id)
+    temp_table_columns = """ ROW_NUMBER() OVER({partition_over}) AS row_id,
+                    """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)) +\
+                    ','.join(other_columns_in_table)
+    # We can now ignore the original row_id for all computations since we will
+    # create a new table whose row_id column holds serially increasing values
+    # starting from 1. This is to support the scenario where users are not forced
+    # to have a row_id that follows a particular format. This restriction of having to
+    # provide a serially increasing row_id value starting from 1 becomes a pain
+    # point when grouping is used, since the row_id for each group will then have
+    # to start from 1.
+    row_id = 'row_id'
+    pca_union_call_list = []
+    grp_id = 0
+    if not is_sparse:
+        col_id = 'NULL'
+        val_id = 'NULL'
+        row_dim = 0
+        col_dim = 0
+    while True:
+        if result_summary_table:
+            result_summary_table_temp = result_summary_table + unique_string() + "_" + str(grp_id)
+        if grouping_cols:
+            grp_value_dict = distinct_grouping_values[grp_id]
+            grouping_where_clause = ' WHERE ' + ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
+
+        pca_union_call_list.append("""
+            {schema_madlib}._pca_union('{source_table}', '{pc_table}', '{pc_table_mean}', '{row_id}',
+                {k}, '{grouping_cols}', {lanczos_iter}, {use_correlation},
+                '{result_summary_table}', '{result_summary_table_temp}', {variance},
+                {grp_id}, '{grouping_where_clause}', '{select_grouping_cols}',
+                '{temp_table_columns}', {is_sparse}, '{col_id}', '{val_id}',
+                {row_dim}, {col_dim})
+            """.format(schema_madlib=schema_madlib,
+                source_table=source_table, pc_table=pc_table,
+                pc_table_mean=pc_table_mean, row_id=row_id,
+                k='NULL' if k is None else k, grouping_cols=grouping_cols,
+                lanczos_iter=lanczos_iter, use_correlation=use_correlation,
+                result_summary_table=result_summary_table,
+                result_summary_table_temp=result_summary_table_temp,
+                variance='NULL' if variance==None else variance,
+                grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                select_grouping_cols=select_grouping_cols,
+                temp_table_columns=temp_table_columns, is_sparse=is_sparse,
+                col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
+        grp_id += 1
+        if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
+            break
+    # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
+    # <query_i> in parallel.
+    pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
+    try:
+        plpy.execute(pca_union_call)
+    except Exception as e:
+        ## drop the output tables that were created if PCA errored out.
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            DROP TABLE IF EXISTS {1};
+            """.format(pc_table, pc_table_mean))
+        if result_summary_table:
+            plpy.execute("""
+                DROP TABLE IF EXISTS {0};
+                """.format(result_summary_table))
+        plpy.error(str(e) + "\n" + str(e.args) + "\n" + str(e.strerror))
+        raise
+
+    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+
+def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
+        row_id, k, grouping_cols, lanczos_iter, use_correlation,
+        result_summary_table, result_summary_table_temp, variance,
+        grp_id, grouping_where_clause, select_grouping_cols,
+        temp_table_columns, is_sparse, col_id, val_id, row_dim,
+        col_dim, **kwargs):
+    """
+    This function does all the heavy lifting of PCA, for both pca and pca_sparse.
+    It computes the PCA of the matrix in source_table for a single group. Sparse
+    input is densified first and then handled like a dense matrix. The results
+    are inserted into the output tables that the caller has already created.
 
+    Args:
+        @param source_table          TEXT,    -- Source table name (dense matrix)
+        @param pc_table              TEXT,    -- Output table name for the principal components
+        @param pc_table_mean         TEXT,    -- Output table name for the column means
+        @param row_id                TEXT,    -- Column name for the ID for each row
+        @param k                     INTEGER, -- Number of principal components to compute
+        @param grouping_cols         TEXT,    -- Comma-separated list of grouping columns (Default: NULL)
+        @param lanczos_iter          INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension))
+        @param use_correlation       BOOLEAN, -- If True correlation matrix is used for principal components (Default: False)
+        @param result_summary_table  TEXT,    -- Table name to store summary of results (Default: NULL)
+        @param result_summary_table_temp  TEXT,    -- Per-group temporary summary table; its rows are copied into result_summary_table and it is then dropped
+        @param variance              DOUBLE PRECISION,   -- The proportion of variance (Default: NULL)
+        @param grp_id                INTEGER, -- a place holder id for each group
+        @param grouping_where_clause TEXT,    -- WHERE clause using grouping_cols
+        @param select_grouping_cols  TEXT,    -- SELECT clause using grouping_cols
+        @param temp_table_columns    TEXT,    -- SELECT clause for creating temporary copy of the source_table
+        @param is_sparse             BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+        @param col_id                TEXT,    -- sparse representation based detail
+        @param val_id                TEXT,    -- sparse representation based detail
+        @param row_dim               INTEGER, -- sparse representation based detail
+        @param col_dim               INTEGER  -- sparse representation based detail
+
+    Returns:
+        None
+    """
+    ## Creation of this temp table is unnecessary if the scenario does not involve
+    ## grouping, and/or, the input table had perfect values for the row_id column.
+    ## This temp table will ensure pca works even when row_id of the source_table
+    ## does not have serially increasing numbers starting from 1;
+    source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+    plpy.execute("""
+            CREATE TEMP TABLE {source_table_grouped} AS
+            SELECT {temp_table_columns}
+            FROM {source_table}
+            {grouping_where_clause}
+        """.format(source_table_grouped=source_table_grouped,
+            source_table=source_table, grouping_where_clause=grouping_where_clause,
+            temp_table_columns=temp_table_columns))
+    startTime = time.time()  # measure the starting time
     # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, k,
-                   row_id, None, None, None, None,
-                   grouping_cols, lanczos_iter, use_correlation,
-                   result_summary_table,variance)
+    if is_sparse:
+        _validate_args(schema_madlib, source_table_grouped, k, row_id, col_id,
+               val_id, row_dim, col_dim, lanczos_iter,
+               use_correlation, variance)
+        # Step 1.1: Densify the matrix for sparse input tables
+        # We densify the matrix because the recentering process will generate a
+        # dense matrix, so we just wrap around regular PCA.
+        # First we must copy the sparse matrix and add in the dimension information
+        sparse_temp = "pg_temp." + unique_string() + "_sparse"
+
+        # Add in the dimension information needed by the densifying process
+        create_temp_sparse_matrix_table_with_dims(source_table_grouped, sparse_temp,
+                                                  row_id, col_id, val_id,
+                                                  row_dim, col_dim)
+
+        x_dense = "pg_temp." + unique_string() + "_dense"
+        plpy.execute("""
+            SELECT {schema_madlib}.matrix_densify(
+                '{sparse_temp}',
+                'row={row_id}, col={col_id}, val={val_id}',
+                '{x_dense}', 'row=row_id, val=row_vec')
+            """.format(schema_madlib=schema_madlib,
+                sparse_temp=sparse_temp, row_id=row_id,
+                col_id=col_id, val_id=val_id, x_dense=x_dense))
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            """.format(sparse_temp))
+        source_table_grouped = x_dense
+        row_id = 'row_id'
+    else:
+        _validate_args(schema_madlib, source_table_grouped, k,
+                       row_id, None, None, None, None,
+                       lanczos_iter, use_correlation, variance)
 
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string() + "_reformated_names"
     created_new_table = cast_dense_input_table_to_correct_columns(
-        schema_madlib, source_table, source_table_copy, row_id)
+        schema_madlib, source_table_grouped, source_table_copy, row_id)
 
     if(created_new_table):
-        source_table = source_table_copy
+        source_table_grouped = source_table_copy
 
-    [row_dim, col_dim] = get_dims(source_table,
+    [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
-    validate_dense(source_table,
+    validate_dense(source_table_grouped,
                    {'row': 'row_id', 'val': 'row_vec'},
                    check_col=False, row_dim=row_dim)
     if k:
@@ -102,13 +353,13 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             curK = lanczos_iter
 
     # Note: we currently don't support grouping columns or correlation matrices
-    if grouping_cols is None and not use_correlation:
+    if not use_correlation:
 
         # Step 2: Normalize the data (Column means)
         dimension = col_dim
         scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
         column_mean_str = _recenter_data(schema_madlib,
-                                         source_table,
+                                         source_table_grouped,
                                          scaled_source_table,
                                          'row_id',
                                          'row_vec',
@@ -116,19 +367,19 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         # Step 3: Create temporary output & result summary table
         svd_output_temp_table = "pg_temp."+ unique_string()+ "_svd_out_tbl"
 
-        if result_summary_table is None:
+        if result_summary_table_temp is None:
             result_summary_table_string = ''
         else:
-            result_summary_table_string = ", '{0}'".format(result_summary_table)
+            result_summary_table_string = ", '{0}'".format(result_summary_table_temp)
 
         # Step 4: Perform SVD
         # Step 4.1: Perform upper part of SVD
-        if result_summary_table:
+        if result_summary_table_temp:
             t0 = time.time()
 
         (source_table_svd,bd_pref) = _svd_upper_wrap(schema_madlib,
             scaled_source_table, svd_output_temp_table,
-            row_id, curK, lanczos_iter, result_summary_table)
+            row_id, curK, lanczos_iter, result_summary_table_temp)
 
         # Calculate the sum of values for proportion
         svd_var_s = add_postfix(svd_output_temp_table, "_s")
@@ -195,9 +446,9 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                 svd_output_temp_table, row_id, curK, lanczos_iter, bd_pref)
 
         # Step 4.4: Create the SVD result table
-        if result_summary_table:
+        if result_summary_table_temp:
             t1 = time.time()
-            [row_dim, col_dim] = get_dims(source_table,
+            [row_dim, col_dim] = get_dims(source_table_grouped,
                 {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'})
             arguments = {'schema_madlib': schema_madlib,
                          'source_table': scaled_source_table,
@@ -206,7 +457,7 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                          'matrix_s': add_postfix(tmp_matrix_table, "_s"),
                          'row_dim': row_dim,
                          'col_dim': col_dim,
-                         'result_summary_table': result_summary_table,
+                         'result_summary_table': result_summary_table_temp,
                          'temp_prefix': "pg_temp." + unique_string(),
                          't0': t0, 't1': t1}
             create_summary_table(**arguments)
@@ -229,11 +480,12 @@ def pca(schema_madlib, source_table, pc_table, row_id,
         # Step 6: Insert the output of SVD into the PCA table
         plpy.execute(
             """
-            CREATE TABLE {pc_table} AS
+            INSERT INTO {pc_table}
             SELECT  {svd_v_transpose}.row_id,
                     row_vec AS principal_components,
                     value / sqrt({row_dim} - 1) AS std_dev,
                     ((value*value)/ {eigen_sum}) AS proportion
+                    {select_grouping_cols}
             FROM {svd_v_transpose},
                  {svd_output_temp_table_s}
             WHERE ({svd_v_transpose}.row_id = {svd_output_temp_table_s}.row_id)
@@ -244,30 +496,24 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                        svd_v_transpose=svd_v_transpose,
                        pc_table=pc_table,
                        row_dim=row_dim,
-                       eigen_sum=eigen_sum))
+                       eigen_sum=eigen_sum,
+                       select_grouping_cols=select_grouping_cols))
         # Output the column mean
-        pc_table_mean = add_postfix(pc_table, "_mean")
         plpy.execute(
             """
-            DROP TABLE IF EXISTS {pc_table_mean};
-            CREATE TABLE {pc_table_mean} AS
+            INSERT INTO {pc_table_mean}
             SELECT '{column_mean_str}'::FLOAT8[] AS column_mean
+            {select_grouping_cols}
             """.format(pc_table_mean=pc_table_mean,
-                       column_mean_str=column_mean_str))
+                       column_mean_str=column_mean_str,
+                       select_grouping_cols=select_grouping_cols))
         # Step 7: Append to the SVD summary table to get the PCA summary table
-        if result_summary_table:
+        if result_summary_table_temp:
             stopTime = time.time()
             dt = (stopTime - startTime) * 1000.
-            summary_table_tmp_name = unique_string()
-            plpy.execute(
-                """
-                ALTER TABLE {result_summary_table}
-                RENAME TO {tmp_name};
-                """.format(result_summary_table=result_summary_table,
-                           tmp_name=summary_table_tmp_name))
             plpy.execute(
                 """
-                CREATE TABLE {result_summary_table} AS
+                INSERT INTO {result_summary_table}
                 SELECT
                     rows_used,
                     {dt} AS "exec_time (ms)",
@@ -275,13 +521,15 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                     recon_error,
                     relative_recon_error,
                     {use_correlation} AS use_correlation
-                FROM {tmp_name};
+                    {select_grouping_cols}
+                FROM {result_summary_table_temp};
                 """.format(result_summary_table=result_summary_table,
                            dt=str(dt), iter=curK,
                            use_correlation=bool(use_correlation),
-                           tmp_name=summary_table_tmp_name))
-            plpy.execute("DROP TABLE {tmp_name};".format(
-                tmp_name=summary_table_tmp_name))
+                           result_summary_table_temp=result_summary_table_temp,
+                           select_grouping_cols=select_grouping_cols))
+            plpy.execute("DROP TABLE {result_summary_table_temp};".format(
+                result_summary_table_temp=result_summary_table_temp))
 
         # Step 8: Output handling & cleanup
         plpy.execute(
@@ -294,6 +542,7 @@ def pca(schema_madlib, source_table, pc_table, row_id,
             DROP TABLE IF EXISTS {svd_output_temp_table_u};
             DROP TABLE IF EXISTS {svd_output_temp_table_v};
             DROP TABLE IF EXISTS {scaled_source_table};
+            DROP TABLE IF EXISTS {source_table_grouped};
             """.format(svd_output_temp_table=svd_output_temp_table,
                        svd_output_temp_table_s=svd_output_temp_table_s,
                        svd_output_temp_table_u=svd_output_temp_table_u,
@@ -301,9 +550,8 @@ def pca(schema_madlib, source_table, pc_table, row_id,
                        scaled_source_table=scaled_source_table,
                        svd_v_transpose=svd_v_transpose,
                        source_table_copy=source_table_copy,
-                       tmp_matrix_s_table=tmp_matrix_s_table))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+                       tmp_matrix_s_table=tmp_matrix_s_table,
+                       source_table_grouped=source_table_grouped))
 # ------------------------------------------------------------------------
 
 # ------------------------------------------------------------------------
@@ -311,24 +559,20 @@ def pca(schema_madlib, source_table, pc_table, row_id,
 # ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
-                   pc_table,
                    k,
                    row_id,
                    col_id=None,
                    val_id=None,
                    row_dim=None,
                    col_dim=None,
-                   grouping_cols=None,
                    lanczos_iter=0,
                    use_correlation=False,
-                   result_summary_table=None,
                    variance=None):
     """
     Validates all arguments passed to the PCA function
     Args:
         @param schema_madlib    Name of the schema where MADlib is installed
         @param source_table     Name of the source table
-        @param output_table     Name of the output table
         @param k                Number of singular vectors to return
         @param row_id           Name of the row_id column
         @param col_id           Name of the col_id column
@@ -336,7 +580,6 @@ def _validate_args(schema_madlib,
         @param grouping_cols    The columns that the data should be grouped by
         @param lanczos_iter     The number of lanczos iterations to use in the SVD calculation
         @param use_correlation  If the correlation matrix should be used instead of the covariance matrix
-        @param result_summary_table  Name of summary table
         @param variance         Proportion of variance
 
     Returns:
@@ -360,23 +603,11 @@ def _validate_args(schema_madlib,
         if (variance <= 0) or (variance >1):
             plpy.error("""PCA error: components_param must be either
                 a positive integer or a float in the range (0.0,1.0]!""")
-    # confirm output tables are valid
-    if pc_table:
-        _assert(not table_exists(pc_table, only_first_schema=True) and
-                not table_exists(pc_table + '_mean', only_first_schema=True),
-                "PCA error: Output table {pc_table}/{pc_table}_mean "
-                "already exist!".format(pc_table=pc_table))
-    else:
-        plpy.error("PCA error: Invalid output table prefix!")
 
     _assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
             "PCA error: {1} column does not exist in {0}!".
             format(source_table, "NULL" if row_id is None else row_id))
 
-    if(grouping_cols):
-        plpy.error("PCA error: Grouping columns are not currently supported!\
-        This value must be set to NULL")
-
     if (lanczos_iter < 0):
         plpy.error("PCA error: lanczos_iter can't be negative! (Use zero for \
         default value)  The provided value is {0}".format(str(lanczos_iter)))
@@ -415,15 +646,27 @@ def _validate_args(schema_madlib,
     if use_correlation:
         plpy.error("PCA error: Using the correlation matrix is not enabled! \
         This value must be set to FALSE")
+# ========================================================================
 
+def _validate_args_output_table(pc_table, result_summary_table=None):
+    """
+        confirm output tables are valid
+        @param pc_table     Name of the output table
+        @param result_summary_table  Name of summary table
+    """
+    if pc_table:
+        _assert(not table_exists(pc_table, only_first_schema=True) and
+                not table_exists(pc_table + '_mean', only_first_schema=True),
+                "PCA error: Output table {pc_table}/{pc_table}_mean "
+                "already exist!".format(pc_table=pc_table))
+    else:
+        plpy.error("PCA error: Invalid output table prefix!")
     if result_summary_table:
         if not result_summary_table.strip():
             plpy.error("PCA error: Invalid result summary table name!")
         _assert(not table_exists(result_summary_table, only_first_schema=True),
                 "PCA error: Result summary table {0} \
                         already exists!".format(result_summary_table))
-# ========================================================================
-
 
 def _recenter_data(schema_madlib, source_table, output_table, row_id,
                    col_name, dimension):
@@ -476,123 +719,7 @@ def _recenter_data(schema_madlib, source_table, output_table, row_id,
     return x_mean_str
 # ------------------------------------------------------------------------
 
-
-def pca_sparse(schema_madlib,
-               source_table,
-               pc_table,
-               row_id,
-               col_id,
-               val_id,
-               row_dim,
-               col_dim,
-               k,
-               grouping_cols,
-               lanczos_iter,
-               use_correlation,
-               result_summary_table,
-               variance,
-               **kwargs):
-    """
-    Compute the PCA of a sparse matrix in source_table.
-
-    This function is the specific call for dense matrices and creates three
-    tables corresponding to the three decomposition matrices.
-
-    Args:
-        @param schema_madlib
-        @param source_table
-        @param pc_table
-        @param row_id
-        @param col_id
-        @param val_id
-        @param row_dim
-        @param col_dim
-        @param k
-        @param grouping_cols
-        @param lanczos_iter
-        @param use_correlation
-        @param result_summary_table
-        @param variance
-
-    Returns:
-        None
-
-    """
-    startTime = time.time()
-    # Reset the message level to avoid random messages
-    old_msg_level = plpy.execute("""
-                                  SELECT setting
-                                  FROM pg_settings
-                                  WHERE name='client_min_messages'
-                                  """)[0]['setting']
-    plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, k, row_id, col_id,
-                   val_id, row_dim, col_dim, grouping_cols, lanczos_iter,
-                   use_correlation, result_summary_table, variance)
-
-    # Step 2: Densify the matrix
-    #  We densify the matrix because the recentering process will generate a
-    # dense matrix, so we just wrap around regular PCA.
-    # First we must copy the sparse matrix and add in the dimension information
-
-    sparse_temp = "pg_temp." + unique_string() + "_sparse"
-
-    # Add in the dimension information need by the densifying process
-    create_temp_sparse_matrix_table_with_dims(source_table, sparse_temp,
-                                              row_id, col_id, val_id,
-                                              row_dim, col_dim)
-
-    x_dense = "pg_temp." + unique_string() + "_dense"
-    plpy.execute("""
-        SELECT {schema_madlib}.matrix_densify(
-            '{sparse_temp}',
-            'row={row_id}, col={col_id}, val={val_id}',
-            '{x_dense}', 'row=row_id, val=row_vec')
-        """.format(**locals()))
-
-    # Step 3: Pass the densified matrix to regular PCA
-    pca(schema_madlib, x_dense, pc_table, 'row_id',
-        k, grouping_cols, lanczos_iter, use_correlation,
-        result_summary_table, variance)
-
-    # Step 4: Clean up
-    plpy.execute("""
-        DROP TABLE IF EXISTS {x_dense};
-        DROP TABLE IF EXISTS {sparse_temp};
-        """.format(x_dense=x_dense, sparse_temp=sparse_temp))
-
-    if result_summary_table:
-        stopTime = time.time()
-        dt = (stopTime - startTime) * 1000.
-        summary_table_tmp_name = unique_string()
-        plpy.execute(
-            """
-            ALTER TABLE {result_summary_table}
-            RENAME TO {tmp_name};
-            """.format(result_summary_table=result_summary_table,
-                       tmp_name=summary_table_tmp_name))
-        plpy.execute(
-            """
-            CREATE TABLE {result_summary_table} AS
-            SELECT
-                rows_used,
-                {dt} AS "exec_time (ms)",
-                iter,
-                recon_error,
-                relative_recon_error,
-                use_correlation
-            FROM {tmp_name};
-            """.format(result_summary_table=result_summary_table,
-                       dt=str(dt), tmp_name=summary_table_tmp_name))
-        plpy.execute("DROP TABLE {tmp_name};".format(
-            tmp_name=summary_table_tmp_name))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
-# ------------------------------------------------------------------------
-
-
+# Sparse PCA train help function
 def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
     """
     Given a help string, provide usage information
@@ -607,58 +734,66 @@ def pca_sparse_help_message(schema_madlib, message=None, **kwargs):
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
-        -----------------------------------------------------------------------
-                                    USAGE
-        -----------------------------------------------------------------------
-        SELECT {schema_madlib}.pca_sparse_train(
-            source_table        -- TEXT,    Name of data table
-            pc_table            -- TEXT,    Name of the table containing the principle components
-            row_id              -- TEXT,    Column name for the row coordinates.
-            col_id              -- TEXT,    Column name for the column coordinates.
-            val_id              -- TEXT,    Column name for the sparse values.
-            row_dim,            -- INTEGER, The number of rows in the sparse matrix
-            col_dim,            -- INTEGER, The number of columns in the sparse matrix
-            components_param    -- INTEGER OR FLOAT, The parameter to control the number of principal components to calculate from the input data.
-            [
-            grouping_cols       -- TEXT,    Comma-separated list of grouping columns
-                                            (Default: NULL)
-            lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
-                                            (Default: minimum of of the smallest input
-                                                matrix dimension and k+40)
-            use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
-                                            (Default: False)
-            rslt_summary_table  -- TEXT,    Table name to store summary of results
-                                            (Default: NULL)
-            ]
-        );
-        If components_param is INTEGER it is used for denoting the number of principal components to compute.
-        If components_param is FLOAT it is used as the target proportion of variance.
-        -------------------------------------------------------------------------
-                                OUTPUT TABLES
-        -------------------------------------------------------------------------
-        The output table ("pc_table" above) has the following columns:
-            row_id              -- INTEGER, The ranking of the eigenvalues
-            prin_comp           -- FLOAT[], The principal components
-            eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
-
-        A secondary output table named "pc_table"_mean is also generated.
-        This table has only the single column:
-            column_mean         -- FLOAT[], The column means of the input data
-
-        -------------------------------------------------------------------------
-                            RESULT SUMMARY TABLE
-        -------------------------------------------------------------------------
-        The result summary table ("rslt_summary_table" above) has the following columns
-            rows_used              -- INTEGER,  Number of rows used in the PCA calculation
-            exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
-            use_correlation        -- BOOLEAN,  Value of parameter use_correlation
-            iter                   -- INTEGER,  Number of iterations the SVD took to converge
-            recon_error            -- FLOAT,    Absolute error in the approximation
-            relative_recon_error   -- FLOAT     Relative error in the approximation
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.pca_sparse_train(
+    source_table        -- TEXT,    Name of data table
+    pc_table            -- TEXT,    Name of the table containing the principal components
+    row_id              -- TEXT,    Column name for the row coordinates.
+    col_id              -- TEXT,    Column name for the column coordinates.
+    val_id              -- TEXT,    Column name for the sparse values.
+    row_dim,            -- INTEGER, The number of rows in the sparse matrix
+    col_dim,            -- INTEGER, The number of columns in the sparse matrix
+    components_param    -- INTEGER OR FLOAT, The parameter to control the number of
+                                    principal components to calculate from the input data.
+    grouping_cols       -- TEXT,    Comma-separated list of grouping columns
+                                    (Default: NULL)
+    lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
+                                    (Default: minimum of the smallest input
+                                        matrix dimension and k+40)
+    use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
+                                    (Default: False)
+    rslt_summary_table  -- TEXT,    Table name to store summary of results
+                                    (Default: NULL)
+);
+If components_param is INTEGER it is used for denoting the number of principal components to compute.
+If components_param is FLOAT it is used as the target proportion of variance.
+-------------------------------------------------------------------------
+                        OUTPUT TABLES
+-------------------------------------------------------------------------
+A PCA model is created for each group, if grouping_cols is specified.
+The output table ("pc_table" above) has the following columns:
+    row_id              -- INTEGER, The ranking of the eigenvalues
+    prin_comp           -- FLOAT[], The principal components
+    eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+
+A secondary output table named "pc_table"_mean is also generated.
+This table has only the single column:
+    column_mean         -- FLOAT[], The column means of the input data
+
+-------------------------------------------------------------------------
+                    RESULT SUMMARY TABLE
+-------------------------------------------------------------------------
+The result summary table ("rslt_summary_table" above) has the following columns
+    rows_used              -- INTEGER,  Number of rows used in the PCA calculation
+    exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
+    use_correlation        -- BOOLEAN,  Value of parameter use_correlation
+    iter                   -- INTEGER,  Number of iterations the SVD took to converge
+    recon_error            -- FLOAT,    Absolute error in the approximation
+    relative_recon_error   -- FLOAT     Relative error in the approximation
+    grouping_cols          -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
         """.format(schema_madlib=schema_madlib)
     else:
-        if message.lower() in ("example", "examples"):
+        if message is not None and \
+                message.lower() in ("example", "examples"):
             return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
 DROP TABLE IF EXISTS sparse_mat;
 CREATE TABLE sparse_mat (
     row_id integer,
@@ -676,13 +811,46 @@ COPY sparse_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
 9|3|4
 9|8|2
 \.
-DROP TABLE IF EXISTS result_table;
-DROP TABLE IF EXISTS result_table_mean;
-SELECT pca_sparse_train('sparse_mat', 'result_table',
+
+DROP TABLE IF EXISTS result_table_sparse;
+DROP TABLE IF EXISTS result_table_sparse_mean;
+SELECT {schema_madlib}.pca_sparse_train('sparse_mat', 'result_table_sparse',
 'row_id', 'col_id', 'val_id', 10, 10, 10);
-            """
+
+DROP TABLE IF EXISTS sparse_mat_with_grouping;
+CREATE TABLE sparse_mat_with_grouping (
+    row_id integer,
+    col_id integer,
+    val_id integer,
+    matrix_id integer
+);
+COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|1
+8|1|2|1
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+8|1|2|2
+8|7|2|2
+\.
+
+DROP TABLE IF EXISTS result_table_sparsed_grouped;
+DROP TABLE IF EXISTS result_table_sparsed_grouped_mean;
+SELECT {schema_madlib}.pca_sparse_train('sparse_mat_with_grouping', 'result_table_sparsed_grouped',
+'row_id', 'col_id', 'val_id', 10, 10, 10, 'matrix_id');
+            """.format(schema_madlib=schema_madlib)
         else:
             return """
+----------------------------------------------------------------
+         Summary: Sparse PCA Training
+----------------------------------------------------------------
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -692,8 +860,10 @@ accounts for as much of the variability in the data as possible), and each
 succeeding component in turn has the highest variance possible under the
 constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
 components.
-
-For an overview on usage, run: SELECT {schema_madlib}.pca_sparse_train('usage');
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.pca_sparse_train('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
 
@@ -711,75 +881,122 @@ def pca_help_message(schema_madlib, message=None, **kwargs):
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
-        -----------------------------------------------------------------------
-                                    USAGE
-        -----------------------------------------------------------------------
-        SELECT {schema_madlib}.pca_train(
-            source_table        -- TEXT,    Name of data table
-            pc_table            -- TEXT,    Name of the table containing the principle components
-            row_id              -- TEXT,    Column name for the row coordinates.
-            components_param    -- INTEGER OR FLOAT, The parameter to control the number of principal components to calculate from the input data.
-            [
-            grouping_cols       -- TEXT,    Comma-separated list of grouping columns
-                                            (Default: NULL)
-            lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
-                                            (Default: minimum of of the smallest input
-                                                matrix dimension and k+40)
-            use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
-                                            (Default: False)
-            rslt_summary_table  -- TEXT,    Table name to store summary of results
-                                            (Default: NULL)
-            variance            -- DOUBLE PRECISION, Proportion of variance
-                                            (Default: NULL)
-            ]
-        );
-        If components_param is INTEGER it is used for denoting the number of principal components to compute.
-        If components_param is FLOAT it is used as the target proportion of variance.
-        -------------------------------------------------------------------------
-                                OUTPUT TABLES
-        -------------------------------------------------------------------------
-        The output table ("pc_table" above) has the following columns:
-            row_id              -- INTEGER, The ranking of the eigenvalues
-            prin_comp           -- FLOAT[], The principal components
-            eigen_values        -- FLOAT[]  The eigenvalues associated with each principal component
-
-        A secondary output table named "pc_table"_mean is also generated.
-        This table has only the single column:
-            column_mean         -- FLOAT[], The column means of the input data
-        -------------------------------------------------------------------------
-                            RESULT SUMMARY TABLE
-        -------------------------------------------------------------------------
-        The result summary table ("rslt_summary_table" above) has the following columns
-            rows_used              -- INTEGER,  Number of rows used in the PCA calculation
-            exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
-            use_correlation        -- BOOLEAN,  Value of parameter use_correlation
-            iter                   -- INTEGER,  Number of iterations the SVD took to converge
-            recon_error            -- FLOAT,    Absolute error in the approximation
-            relative_recon_error   -- FLOAT     Relative error in the approximation
+-----------------------------------------------------------------------
+                            USAGE
+-----------------------------------------------------------------------
+SELECT {schema_madlib}.pca_train(
+    source_table        -- TEXT,    Name of data table
+    pc_table            -- TEXT,    Name of the table containing the principal components
+    row_id              -- TEXT,    Column name for the row coordinates.
+    components_param    -- INTEGER OR FLOAT, The parameter to control the number of
+                                             principal components to calculate from
+                                             the input data.
+
+    grouping_cols       -- TEXT,    Comma-separated list of grouping column names
+                                    (Default: NULL)
+    lanczos_iter        -- INTEGER, The number of Lanczos iterations to use in the SVD calculation
+                                    (Default: minimum of the smallest input
+                                        matrix dimension and k+40)
+    use_correlation     -- BOOLEAN, If True correlation matrix is used for principal components
+                                    (Default: False)
+    rslt_summary_table  -- TEXT,    Table name to store summary of results
+                                    (Default: NULL)
+    variance            -- DOUBLE PRECISION, Proportion of variance
+                                    (Default: NULL)
+);
+If components_param is INTEGER it is used for denoting the number of
+principal components to compute. If components_param is FLOAT it is used
+as the target proportion of variance.
+-------------------------------------------------------------------------
+                        OUTPUT TABLES
+-------------------------------------------------------------------------
+A PCA model is created for each group, if grouping_cols is specified.
+The output table ("pc_table" above) has the following columns:
+    row_id              -- INTEGER, The ranking of the eigenvalues
+    prin_comp           -- FLOAT[], The principal components
+    eigen_values        -- FLOAT[], The eigenvalues associated with each
+                                    principal component
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+
+A secondary output table named "pc_table"_mean is also generated.
+This table has the following columns:
+    column_mean         -- FLOAT[], The column means of the input data
+    grouping_cols       -- The grouping columns (with their types), if any,
+                           specified in grouping_cols
+-------------------------------------------------------------------------
+                    RESULT SUMMARY TABLE
+-------------------------------------------------------------------------
+The result summary table ("rslt_summary_table" above) has the following columns
+    rows_used              -- INTEGER,  Number of rows used in the PCA calculation
+    exec_time              -- FLOAT,    Number of milliseconds the PCA calculation took
+    use_correlation        -- BOOLEAN,  Value of parameter use_correlation
+    iter                   -- INTEGER,  Number of iterations the SVD took to converge
+    recon_error            -- FLOAT,    Absolute error in the approximation
+    relative_recon_error   -- FLOAT     Relative error in the approximation
+    grouping_cols          -- The grouping columns (with their types), if any,
+                              specified in grouping_cols
         """.format(schema_madlib=schema_madlib)
     else:
-        if message.lower() in ("example", "examples"):
+        if message is not None and \
+                message.lower() in ("example", "examples"):
             return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
 DROP TABLE IF EXISTS mat;
 CREATE TABLE mat (
     row_id integer,
     row_vec double precision[]
 );
 COPY mat (row_id, row_vec) FROM stdin DELIMITER '|';
-1|{1,2,3}
-2|{2,1,2}
-3|{3,2,1}
+1|{{1,2,3}}
+2|{{2,1,2}}
+3|{{3,2,1}}
 \.
+
 DROP TABLE IF EXISTS result_table;
 DROP TABLE IF EXISTS result_table_mean;
-SELECT pca_train( 'mat',
-                  'result_table',
-                  'row_id',
-                  3
+SELECT {schema_madlib}.pca_train( 'mat',
+          'result_table',
+          'row_id',
+          3
     );
-            """
+
+DROP TABLE IF EXISTS mat_with_grouping;
+CREATE TABLE mat_with_grouping (
+    row_id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{{1,2,3}}|1
+2|{{2,1,2}}|1
+3|{{3,2,1}}|1
+4|{{1,2,3,4,5}}|2
+5|{{2,1,2,4,5}}|2
+6|{{3,2,1,4,5}}|2
+7|{{1,2,4,4,5}}|2
+8|{{1,3,3,4,5}}|2
+9|{{2,2,2,4,5}}|2
+10|{{3,3,2,4,5}}|2
+11|{{2,2,3,4,5}}|2
+\.
+
+DROP TABLE IF EXISTS result_table_grp;
+DROP TABLE IF EXISTS result_table_grp_mean;
+SELECT {schema_madlib}.pca_train( 'mat_with_grouping',
+          'result_table_grp',
+          'row_id',
+          0.9,
+          'matrix_id'
+    );
+            """.format(schema_madlib=schema_madlib)
         else:
             return """
+----------------------------------------------------------------
+         Summary: PCA Training
+----------------------------------------------------------------
 Principal component analysis (PCA) is a mathematical procedure that uses an
 orthogonal transformation to convert a set of observations of possibly
 correlated variables into a set of values of linearly uncorrelated variables
@@ -789,6 +1006,8 @@ accounts for as much of the variability in the data as possible), and each
 succeeding component in turn has the highest variance possible under the
 constraint that it be orthogonal to (i.e., uncorrelated with) the preceding
 components.
-
-For an overview on usage, run: SELECT {schema_madlib}.pca_train('usage');
+--
+For an overview on usage, run:
+SELECT {schema_madlib}.pca_train('usage');
+--
             """.format(schema_madlib=schema_madlib)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 9f573f3..3edf304 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -210,18 +210,13 @@ component, but 'components_param' = 1.0 (FLOAT) will return all
 principal components, i.e., proportion of variance of 100%.
 
 <DT>grouping_cols (optional)</DT>
-<DD>TEXT, default: NULL.  
-
-@note <em>Not currently implemented. Any non-NULL value is ignored.
-   Grouping support will be added in a future release. </em> The parameter 
-   is planned to be implemented as a 
-   comma-separated list of column names, with the source data grouped using 
-   the combination of all the columns. An independent PCA model will be 
-   computed for each combination of the grouping columns.</DD>
+<DD>TEXT, default: NULL. A comma-separated list of column names, with the
+source data grouped using the combination of all the columns. An independent
+PCA model will be computed for each combination of the grouping columns.</DD>
 
 <DT>lanczos_iter (optional)</DT>
 <DD>INTEGER, default: minimum of {<em>k+40</em>, smallest matrix dimension}
-where <em>k</em> is the number of principle components specified in the 
+where <em>k</em> is the number of principal components specified in the 
 parameter 'components_param'.  This parameter defines the 
 number of Lanczos iterations for the SVD calculation.
 The Lanczos iteration number roughly corresponds to the accuracy of the SVD
@@ -300,7 +295,7 @@ INSERT INTO mat VALUES
 (3, '{3,2,1}');
 </pre>
 
--# Run the PCA function for a specified number of principle components and view the results:
+-# Run the PCA function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
 SELECT madlib.pca_train( 'mat',
@@ -333,6 +328,49 @@ SELECT * FROM result_table;
       2 | {2.22044604925031e-16,-1,1.11022302462516e-16}               | 0.577350269189626 | 0.142857142857041
 </pre>
 
+-# An example of using grouping_cols to learn different models for each group.
+Create sample data in dense matrix form:
+<pre class="example">
+DROP TABLE IF EXISTS mat_with_grouping;
+CREATE TABLE mat_with_grouping (
+    row_id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{1,2,3}|1
+2|{2,1,2}|1
+3|{3,2,1}|1
+4|{1,2,3,4,5}|2
+5|{2,1,2,4,5}|2
+6|{3,2,1,4,5}|2
+7|{1,2,4,4,5}|2
+8|{1,3,3,4,5}|2
+9|{2,2,2,4,5}|2
+10|{3,3,2,4,5}|2
+11|{2,2,3,4,5}|2
+\.
+</pre>
+-# Run the PCA function for a specified proportion of variance and view the results:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grp, result_table_grp_mean;
+SELECT madlib.pca_train( 'mat_with_grouping',
+          'result_table_grp',
+          'row_id',
+          0.9,
+          'matrix_id'
+    );
+SELECT * FROM result_table_grp;
+</pre>
+<pre class="result">
+ row_id |                                         principal_components                                         |      std_dev      |    proportion     | matrix_id
+--------+------------------------------------------------------------------------------------------------------+-------------------+-------------------+-----------
+      1 | {-0.707106781186547,1.7650811368064e-16,0.707106781186547}                                           |   1.4142135623731 | 0.857142857142245 |         1
+      2 | {1.38777878078145e-17,1,4.16333634234434e-17}                                                        | 0.577350269189625 | 0.142857142857041 |         1
+      1 | {0.6597973999259,-0.0411675274295868,-0.750315017666817,-8.77661658627016e-17,-1.98408997564847e-16} |  1.19430528584267 | 0.726149513493974 |         2
+      2 | {-0.608369791899382,0.114069335981134,0.608369791899382,0.0380149274618831,0.495293291381477}        |  1.03132841282174 | 0.541488586593004 |         2
+</pre>
+
 -# Create sample data in sparse matrix form:
 <pre class="example">
 DROP TABLE IF EXISTS sparse_mat;
@@ -357,8 +395,8 @@ INSERT INTO sparse_mat VALUES
 <pre class="example">
 DROP TABLE IF EXISTS dense_mat;
 SELECT madlib.matrix_densify(
-							'sparse_mat', 
-							'row=row_id, col=col_id, val=val_id', 
+							'sparse_mat',
+							'row=row_id, col=col_id, val=val_id',
 							'dense_mat');
 SELECT * FROM dense_mat order by row_id;
 </pre>
@@ -376,24 +414,24 @@ SELECT * FROM dense_mat order by row_id;
       9 | {0,0,4,0,0,0,0,2}
 </pre>
 
--# Run the PCA sparse function for a specified number of principle components and view the results:
+-# Run the PCA sparse function for a specified number of principal components and view the results:
 <pre class="example">
 DROP TABLE IF EXISTS result_table, result_table_mean;
 SELECT madlib.pca_sparse_train(
-                                'sparse_mat', 
+                                'sparse_mat',
                                 'result_table',
-                                'row_id', 
-                                'col_id', 
-                                'val_id', 
-                                9, 
-                                8, 
+                                'row_id',
+                                'col_id',
+                                'val_id',
+                                9,
+                                8,
                                 5);
 SELECT * FROM result_table;
 </pre>
-Result (with principle components truncated for readability):
+Result (with principal components truncated for readability):
 <pre class="result">
-  row_id |      principal_components                   |      std_dev      |     proportion     
---------+----------------------------------------------------------------------------------------
+ row_id |           principal_components              |      std_dev      |     proportion     
+--------+---------------------------------------------+-------------------+----------------------
       1 | {0.0189854059340971,0.0593979357345431,…    |  4.03069474374092 |  0.604208682045711
       2 | {0.0346801706473592,-0.536234300404824,…    |  2.42282285507368 |  0.218308410262949
       3 | {0.166190350977087,-0.112693750915351,…     |  1.54680674776235 | 0.0889814051004931
@@ -401,6 +439,63 @@ Result (with principle components truncated for readability):
       5 | {0.645363366217337,0.0403370697192613,…     | 0.906957663197704 | 0.0305915282045503
 </pre>
 
+-# Create sample data in sparse matrix form, with a grouping column:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_with_grouping;
+CREATE TABLE sparse_mat_with_grouping (
+    row_id integer,
+    col_id integer,
+    val_id integer,
+    matrix_id integer
+);
+COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|1
+8|1|2|1
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+8|1|2|2
+8|7|2|2
+\.
+</pre>
+
+-#  Run the PCA sparse function for a specified proportion of variance and view the results:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grouped, result_table_grouped_mean;
+SELECT madlib.pca_sparse_train (
+        'sparse_mat_with_grouping',
+        'result_table_grouped',
+        'row_id',
+        'col_id',
+        'val_id',
+        10,
+        10,
+        0.9,
+        'matrix_id'
+    );
+
+SELECT * FROM result_table_grouped;
+</pre>
+Result (with principal components truncated for readability):
+<pre class="result">
+ row_id |         principal_components                 |     std_dev      |     proportion     | matrix_id
+--------+----------------------------------------------+------------------+--------------------+-----------
+      1 | {0.0169618842872617,0.0374636722968646,...   | 3.81515913912922 |  0.600912629871244 |         1
+      2 | {-0.0290315981382334,-0.0981953629958789,... |   1.925316518345 |  0.153034831478482 |         1
+      3 | {-0.0321284612209643,-0.396464491482199,...  | 1.42677134361186 | 0.0840416890026057 |         1
+      4 | {-4.61214190235381e-16,0.707106781186545,... | 1.33333333333333 | 0.0733944954128442 |         1
+      1 | {0.0178733514257156,0.0394965128556239,...   | 3.80635791249958 |  0.710213752845497 |         2
+      2 | {0.0417974843144143,0.144805266892335,...    | 1.89702728276027 |  0.176407476055726 |         2
+      3 | {-1.3150650111236,7.0774402964125e-17,...    | 1.69008042534288 |  0.140018227653292 |         2
+</pre>
+
 @anchor notes
 @par Notes
 
@@ -478,8 +573,6 @@ File pca.sql_in documenting the SQL functions
 
 \ref grp_pca_project
 
-
-
 */
 
 -- -----------------------------------------------------------------------
@@ -913,3 +1006,31 @@ BEGIN
 END;
 $$ LANGUAGE plpgsql IMMUTABLE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_union(
+    source_table          TEXT,    -- Source table name (dense matrix)
+    pc_table              TEXT,    -- Output table name for the principal components
+    pc_table_mean         TEXT,    -- Output table name for the column means
+    row_id                TEXT,    -- Column name for the ID for each row
+    k                     INTEGER, -- Number of principal components to compute
+    grouping_cols         TEXT,    -- Comma-separated list of grouping columns (Default: NULL)
+    lanczos_iter          INTEGER, -- The number of Lanczos iterations for the SVD calculation (Default: min(k+40, smallest Matrix dimension))
+    use_correlation       BOOLEAN, -- If True correlation matrix is used for principal components (Default: False)
+    result_summary_table  TEXT,    -- Table name to store summary of results (Default: NULL)
+    result_summary_table_temp  TEXT,    -- Table name to store summary of results (Default: NULL)
+    variance              DOUBLE PRECISION,   -- The proportion of variance (Default: NULL)
+    grp_id                INTEGER, -- a place holder id for each group
+    grouping_where_clause TEXT,    -- WHERE clause using grouping_cols
+    select_grouping_cols  TEXT,    -- SELECT clause using grouping_cols
+    temp_table_columns    TEXT,    -- SELECT clause for creating temporary copy of the source_table
+    is_sparse             BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+    col_id                TEXT,    -- sparse representation based detail
+    val_id                TEXT,    -- sparse representation based detail
+    row_dim               INTEGER, -- sparse representation based detail
+    col_dim               INTEGER  -- sparse representation based detail
+)
+RETURNS VOID AS $$
+PythonFunction(pca, pca, _pca_union)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

[6/6] incubator-madlib git commit: Bug fixes and minor changes

Posted by nj...@apache.org.

Bug fixes and minor changes

* Validate parameters earlier than what was done.
* There was an issue with pca_project on greenplum, due to
the usage of row_number() multiple times (while trying to
create a mapping table between original row_id and the
serially increasing row id introduced in the code). Changes
made to use row_number only once now.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/edb69dd4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/edb69dd4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/edb69dd4

Branch: refs/heads/master
Commit: edb69dd4126125f0a6ecc7eaf4149022da157442
Parents: 8bd46ae
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Thu Jan 5 09:48:02 2017 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 18 14:05:09 2017 -0800

----------------------------------------------------------------------
 .../postgres/modules/linalg/matrix_ops.py_in    |   9 +-
 src/ports/postgres/modules/pca/pca.py_in        | 154 +++++------
 src/ports/postgres/modules/pca/pca.sql_in       |   1 +
 .../postgres/modules/pca/pca_project.py_in      | 257 +++++++++----------
 .../postgres/modules/pca/pca_project.sql_in     |   1 +
 src/ports/postgres/modules/pca/test/pca.sql_in  |  42 ++-
 6 files changed, 219 insertions(+), 245 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/linalg/matrix_ops.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/linalg/matrix_ops.py_in b/src/ports/postgres/modules/linalg/matrix_ops.py_in
index 51ae7e3..9f3215c 100644
--- a/src/ports/postgres/modules/linalg/matrix_ops.py_in
+++ b/src/ports/postgres/modules/linalg/matrix_ops.py_in
@@ -86,7 +86,8 @@ def _matrix_column_to_array_format(source_table, row_id, output_table,
 def create_temp_sparse_matrix_table_with_dims(source_table,
                                               out_table,
                                               row_id, col_id, value,
-                                              row_dim, col_dim):
+                                              row_dim, col_dim,
+                                              sparse_where_condition=None):
     """
     Make a copy of the input sparse table and add (row_dim, col_dim, NULL) to it
 
@@ -102,6 +103,8 @@ def create_temp_sparse_matrix_table_with_dims(source_table,
     Returns:
         None
     """
+    if not sparse_where_condition:
+        sparse_where_condition = ''
     plpy.execute("""
                  CREATE TABLE {out_table} as
                      SELECT
@@ -110,11 +113,13 @@ def create_temp_sparse_matrix_table_with_dims(source_table,
                          {value}
                      FROM {source_table}
                      WHERE {value} is not NULL
+                     {sparse_where_condition}
                  """.format(row_id=row_id,
                             col_id=col_id,
                             value=value,
                             source_table=source_table,
-                            out_table=out_table))
+                            out_table=out_table,
+                            sparse_where_condition=sparse_where_condition))
     res_row_dim, res_col_dim = get_dims(out_table, {'row': row_id,
                                                     'col': col_id,
                                                     'val': value})

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index 9a13b2f..71146d5 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -108,7 +108,15 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
     grouping_cols_list = []
-    _validate_args_output_table(pc_table, result_summary_table)
+    if is_sparse:
+        _validate_args(schema_madlib, source_table, pc_table, k, row_id, col_id,
+                   val_id, row_dim, col_dim, lanczos_iter,
+                   use_correlation, result_summary_table, variance)
+    else:
+        _validate_args(schema_madlib, source_table, pc_table, k,
+                   row_id, None, None, None, None,
+                   lanczos_iter, use_correlation,
+                   result_summary_table,variance)
     if(grouping_cols):
         # validate the grouping columns. We currently only support grouping_cols
         # to be column names in the source_table, and not expressions!
@@ -162,21 +170,26 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
     # declare variables whose values will be different for each group, if
     # grouping_cols is specified
     grouping_where_clause = ''
+    sparse_where_condition = ''
     select_grouping_cols = ''
     temp_table_columns = ''
     result_summary_table_temp = ''
-    other_columns_in_table.remove(row_id)
-    temp_table_columns = """ ROW_NUMBER() OVER({partition_over}) AS row_id,
-                    """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)) +\
-                    ','.join(other_columns_in_table)
+    # For Dense matrix format only:
     # We can now ignore the original row_id for all computations since we will
-    # create a new table with a row_id column that has perfect serially increasing
-    # row_id value. This is to support the scenario where users are not forced
-    # to have a row_id that follows a particular format. This restriction of having to
-    # provide a serially increasing row_id value starting from 1 becomes a pain
-    # point when grouping is used, since the row_id for each group will then have
-    # to start from 1.
-    row_id = 'row_id'
+    # create a new table with a row_id column that has no duplicates and ranges
+    # from 1 to number of rows in the group/table. This is mainly to support the
+    # grouping scenario where the row_id values might not range between 1 and
+    # number of rows in the group, for each group. Doing this also just extends
+    # this behavior for non-grouping scenarios too. If creating a new temp table
+    # that corrects the row_id column is not of much importance in non-grouping
+    # cases, we can avoid creating the temp table and save some computation time.
+    # But, at the moment, the code creates the temp table even for the non-grouping
+    # scenario.
+    # We don't need to do this for sparse representation because of the nature
+    # of its definition.
+    other_columns_in_table.remove(row_id)
+    temp_table_columns = """ ROW_NUMBER() OVER() AS row_id, """ + ','.join(other_columns_in_table)
+
     pca_union_call_list = []
     grp_id = 0
     if not is_sparse:
@@ -189,16 +202,18 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
             result_summary_table_temp = result_summary_table + unique_string() + "_" + str(grp_id)
         if grouping_cols:
             grp_value_dict = distinct_grouping_values[grp_id]
-            grouping_where_clause = ' WHERE ' + ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            sparse_where_condition = ' AND ' + where_conditions
+            grouping_where_clause = ' WHERE ' + where_conditions
             select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
 
         pca_union_call_list.append("""
             {schema_madlib}._pca_union('{source_table}', '{pc_table}', '{pc_table_mean}', '{row_id}',
                 {k}, '{grouping_cols}', {lanczos_iter}, {use_correlation},
                 '{result_summary_table}', '{result_summary_table_temp}', {variance},
-                {grp_id}, '{grouping_where_clause}', '{select_grouping_cols}',
-                '{temp_table_columns}', {is_sparse}, '{col_id}', '{val_id}',
-                {row_dim}, {col_dim})
+                {grp_id}, '{grouping_where_clause}', '{sparse_where_condition}',
+                '{select_grouping_cols}', '{temp_table_columns}', {is_sparse},
+                '{col_id}', '{val_id}', {row_dim}, {col_dim})
             """.format(schema_madlib=schema_madlib,
                 source_table=source_table, pc_table=pc_table,
                 pc_table_mean=pc_table_mean, row_id=row_id,
@@ -208,6 +223,7 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
                 result_summary_table_temp=result_summary_table_temp,
                 variance='NULL' if variance==None else variance,
                 grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                sparse_where_condition=sparse_where_condition,
                 select_grouping_cols=select_grouping_cols,
                 temp_table_columns=temp_table_columns, is_sparse=is_sparse,
                 col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
@@ -217,20 +233,7 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
     # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
     # <query_i> in parallel.
     pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
-    try:
-        plpy.execute(pca_union_call)
-    except Exception as e:
-        ## drop the output tables that were created if PCA errored out.
-        plpy.execute("""
-            DROP TABLE IF EXISTS {0};
-            DROP TABLE IF EXISTS {1};
-            """.format(pc_table, pc_table_mean))
-        if result_summary_table:
-            plpy.execute("""
-                DROP TABLE IF EXISTS {0};
-                """.format(result_summary_table))
-        plpy.error(str(e) + "\n" + str(e.args) + "\n" + str(e.strerror))
-        raise
+    plpy.execute(pca_union_call)
 
     plpy.execute("SET client_min_messages TO %s" % old_msg_level)
 
@@ -238,9 +241,9 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
 def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
         row_id, k, grouping_cols, lanczos_iter, use_correlation,
         result_summary_table, result_summary_table_temp, variance,
-        grp_id, grouping_where_clause, select_grouping_cols,
-        temp_table_columns, is_sparse, col_id, val_id, row_dim,
-        col_dim, **kwargs):
+        grp_id, grouping_where_clause, sparse_where_condition,
+        select_grouping_cols, temp_table_columns, is_sparse, col_id,
+        val_id, row_dim, col_dim, **kwargs):
     """
     This function does all the heavy lifting of PCA, for both pca and pca_sparse.
     Compute the PCA of the matrix in source_table. This function is the specific
@@ -272,36 +275,22 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
     Returns:
         None
     """
-    ## Creation of this temp table is unnecessary if the scenario does not involve
-    ## grouping, and/or, the input table had perfect values for the row_id column.
-    ## This temp table will ensure pca works even when row_id of the source_table
-    ## does not have serially increasing numbers starting from 1;
-    source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
-    plpy.execute("""
-            CREATE TEMP TABLE {source_table_grouped} AS
-            SELECT {temp_table_columns}
-            FROM {source_table}
-            {grouping_where_clause}
-        """.format(source_table_grouped=source_table_grouped,
-            source_table=source_table, grouping_where_clause=grouping_where_clause,
-            temp_table_columns=temp_table_columns))
     startTime = time.time()  # measure the starting time
-    # Step 1: Validate the input arguments
+    # Step 1: Modify data format for sparse input
     if is_sparse:
-        _validate_args(schema_madlib, source_table_grouped, k, row_id, col_id,
-               val_id, row_dim, col_dim, lanczos_iter,
-               use_correlation, variance)
         # Step 1.1: Densify the matrix for sparse input tables
         # We densify the matrix because the recentering process will generate a
         # dense matrix, so we just wrap around regular PCA.
         # First we must copy the sparse matrix and add in the dimension information
         sparse_temp = "pg_temp." + unique_string() + "_sparse"
-
         # Add in the dimension information needed by the densifying process
-        create_temp_sparse_matrix_table_with_dims(source_table_grouped, sparse_temp,
+        create_temp_sparse_matrix_table_with_dims(source_table, sparse_temp,
                                                   row_id, col_id, val_id,
-                                                  row_dim, col_dim)
-
+                                                  row_dim, col_dim, sparse_where_condition)
+        validate_sparse(sparse_temp,
+                        {'row': row_id, 'col': col_id, 'val': val_id},
+                        check_col=False)
+        # Step 1.2: Densify the input matrix
         x_dense = "pg_temp." + unique_string() + "_dense"
         plpy.execute("""
             SELECT {schema_madlib}.matrix_densify(
@@ -315,12 +304,21 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
             DROP TABLE IF EXISTS {0};
             """.format(sparse_temp))
         source_table_grouped = x_dense
-        row_id = 'row_id'
     else:
-        _validate_args(schema_madlib, source_table_grouped, k,
-                       row_id, None, None, None, None,
-                       lanczos_iter, use_correlation, variance)
-
+        # Creation of this temp table is unnecessary if the scenario does not involve
+        # grouping, and/or, the input table had perfect values for the row_id column.
+        # This temp table will ensure pca works even when the value of row_id column
+        # in dense matrix format does not have values ranging from 1 to number of rows.
+        source_table_grouped = unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                    CREATE TABLE {source_table_grouped} AS
+                    SELECT {temp_table_columns}
+                    FROM {source_table}
+                    {grouping_where_clause}
+                """.format(source_table_grouped=source_table_grouped,
+                    source_table=source_table, grouping_where_clause=grouping_where_clause,
+                    temp_table_columns=temp_table_columns))
+    row_id = 'row_id'
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string() + "_reformated_names"
     created_new_table = cast_dense_input_table_to_correct_columns(
@@ -328,7 +326,6 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
 
     if(created_new_table):
         source_table_grouped = source_table_copy
-
     [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
@@ -351,10 +348,8 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
     else:
         if variance: #lanczos_iter overrides the proportion default for k
             curK = lanczos_iter
-
     # Note: we currently don't support grouping columns or correlation matrices
     if not use_correlation:
-
         # Step 2: Normalize the data (Column means)
         dimension = col_dim
         scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
@@ -371,7 +366,6 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
             result_summary_table_string = ''
         else:
             result_summary_table_string = ", '{0}'".format(result_summary_table_temp)
-
         # Step 4: Perform SVD
         # Step 4.1: Perform upper part of SVD
         if result_summary_table_temp:
@@ -392,7 +386,6 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
             )
             FROM {scaled_source_table}
             """.format(**locals()))[0]['array_sum']
-
         # Step 4.2: Adjust the k value
         if variance:
             variance_tmp_table = "pg_temp."+ unique_string()+ "_var_tmp"
@@ -416,7 +409,6 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
             plpy.execute("""
                 DROP TABLE IF EXISTS {variance_tmp_table}
                 """.format(variance_tmp_table=variance_tmp_table))
-
         # Step 4.3: Perform the lower part of SVD
         tmp_matrix_table = "temp_"+ unique_string()+ "_matrix"
         tmp_matrix_s_table = add_postfix(tmp_matrix_table, "_s")
@@ -444,7 +436,6 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
             tmp_matrix_table = svd_output_temp_table
             _svd_lower_wrap(schema_madlib, source_table_svd,
                 svd_output_temp_table, row_id, curK, lanczos_iter, bd_pref)
-
         # Step 4.4: Create the SVD result table
         if result_summary_table_temp:
             t1 = time.time()
@@ -559,6 +550,7 @@ def _pca_union(schema_madlib, source_table, pc_table, pc_table_mean,
 # ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
+                   pc_table,
                    k,
                    row_id,
                    col_id=None,
@@ -567,12 +559,14 @@ def _validate_args(schema_madlib,
                    col_dim=None,
                    lanczos_iter=0,
                    use_correlation=False,
+                   result_summary_table=None,
                    variance=None):
     """
     Validates all arguments passed to the PCA function
     Args:
         @param schema_madlib    Name of the schema where MADlib is installed
         @param source_table     Name of the source table
+        @param pc_table         Name of the output table
         @param k                Number of singular vectors to return
         @param row_id           Name of the row_id column
         @param col_id           Name of the col_id column
@@ -580,6 +574,7 @@ def _validate_args(schema_madlib,
         @param grouping_cols    The columns that the data should be grouped by
         @param lanczos_iter     The number of lanczos iterations to use in the SVD calculation
         @param use_correlation  If the correlation matrix should be used instead of the covariance matrix
+        @param result_summary_table  Name of summary table
         @param variance         Proportion of variance
 
     Returns:
@@ -603,6 +598,14 @@ def _validate_args(schema_madlib,
         if (variance <= 0) or (variance >1):
             plpy.error("""PCA error: components_param must be either
                 a positive integer or a float in the range (0.0,1.0]!""")
+    # confirm output tables are valid
+    if pc_table:
+        _assert(not table_exists(pc_table, only_first_schema=True) and
+                not table_exists(pc_table + '_mean', only_first_schema=True),
+                "PCA error: Output table {pc_table}/{pc_table}_mean "
+                "already exist!".format(pc_table=pc_table))
+    else:
+        plpy.error("PCA error: Invalid output table prefix!")
 
     _assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
             "PCA error: {1} column does not exist in {0}!".
@@ -640,33 +643,18 @@ def _validate_args(schema_madlib,
         if col_dim <= 0:
             plpy.error("PCA error: The column dimension must be larger than 0!")
 
-        validate_sparse(source_table,
-                        {'row': row_id, 'col': col_id, 'val': val_id},
-                        check_col=False)
     if use_correlation:
         plpy.error("PCA error: Using the correlation matrix is not enabled! \
         This value must be set to FALSE")
-# ========================================================================
 
-def _validate_args_output_table(pc_table, result_summary_table=None):
-    """
-        confirm output tables are valid
-        @param pc_table     Name of the output table
-        @param result_summary_table  Name of summary table
-    """
-    if pc_table:
-        _assert(not table_exists(pc_table, only_first_schema=True) and
-                not table_exists(pc_table + '_mean', only_first_schema=True),
-                "PCA error: Output table {pc_table}/{pc_table}_mean "
-                "already exist!".format(pc_table=pc_table))
-    else:
-        plpy.error("PCA error: Invalid output table prefix!")
     if result_summary_table:
         if not result_summary_table.strip():
             plpy.error("PCA error: Invalid result summary table name!")
         _assert(not table_exists(result_summary_table, only_first_schema=True),
                 "PCA error: Result summary table {0} \
                         already exists!".format(result_summary_table))
+# ========================================================================
+
 
 def _recenter_data(schema_madlib, source_table, output_table, row_id,
                    col_name, dimension):

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/pca/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 3edf304..9de5559 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -1022,6 +1022,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_union(
     variance              DOUBLE PRECISION,   -- The proportion of variance (Default: NULL)
     grp_id                INTEGER, -- a place holder id for each group
     grouping_where_clause TEXT,    -- WHERE clause using grouping_cols
+    sparse_where_condition TEXT,   -- extra AND condition (grouping filter) appended to the WHERE clause when creating temp sparse matrix table with dims
     select_grouping_cols  TEXT,    -- SELECT clause using grouping_cols
     temp_table_columns    TEXT,    -- SELECT caluse for creating temporary copy of the source_table
     is_sparse             BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/pca/pca_project.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.py_in b/src/ports/postgres/modules/pca/pca_project.py_in
index 1e61d3c..52d14ec 100644
--- a/src/ports/postgres/modules/pca/pca_project.py_in
+++ b/src/ports/postgres/modules/pca/pca_project.py_in
@@ -331,54 +331,17 @@ SELECT {schema_madlib}.pca_sparse_project('usage');
 --
         """.format(schema_madlib=schema_madlib)
 
-
-def _validate_args_output_table(out_table, residual_table=None,
-                   result_summary_table=None):
-    """
-    Validates the output table(s) arguments to the PCA project function
-
-    Args:
-        @param out_table  Name of output table to store projection result
-        @param residual_table  Name of the residual table (to store error in projection)
-        @param result_summary_table  Name of result summary table
-    Returns:
-        None
-    Throws:
-        plpy.error if any argument is invalid
-    """
-    # Make sure that the output table does not exist
-    # Also check that the output table is not null
-    _assert(out_table and out_table.strip(),
-            "PCA error: Invalid output table name.")
-    _assert(not table_exists(out_table, only_first_schema=True),
-            "PCA error: Output table {0} already exists!".format(str(out_table)))
-
-        # Check that the result summary table is not empty
-    if result_summary_table is not None:
-        _assert(result_summary_table.strip(),
-                "PCA error: Invalid result summary table name!")
-        _assert(not table_exists(result_summary_table, only_first_schema=True),
-                "PCA error: Result summary table {0} already exists!".
-                format(result_summary_table))
-
-    # Check that the result summary table is not empty
-    if residual_table is not None:
-        _assert(residual_table.strip(),
-                "PCA error: Invalid residual table name!")
-        _assert(not table_exists(residual_table, only_first_schema=True),
-                "PCA error: Residual table {0} already exists!".
-                format(residual_table))
-
-# Validate arguments: Same as pca
-# ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
                    pc_table,
+                   out_table,
                    row_id,
                    col_id=None,
                    val_id=None,
                    row_dim=None,
-                   col_dim=None):
+                   col_dim=None,
+                   residual_table=None,
+                   result_summary_table=None):
     """
     Validates all arguments passed to the PCA function
 
@@ -408,6 +371,29 @@ def _validate_args(schema_madlib,
     _assert(table_exists(add_postfix(pc_table, "_mean")),
             "PCA error: Source data table column means does not exist!")
 
+    # Make sure that the output table does not exist
+    # Also check that the output table is not null
+    _assert(out_table and out_table.strip(),
+            "PCA error: Invalid output table name.")
+    _assert(not table_exists(out_table, only_first_schema=True),
+            "PCA error: Output table {0} already exists!".format(str(out_table)))
+
+    # Check that the result summary table is not empty
+    if result_summary_table is not None:
+        _assert(result_summary_table.strip(),
+                "PCA error: Invalid result summary table name!")
+        _assert(not table_exists(result_summary_table, only_first_schema=True),
+                "PCA error: Result summary table {0} already exists!".
+                format(result_summary_table))
+
+    # Check that the residual table name is not empty
+    if residual_table is not None:
+        _assert(residual_table.strip(),
+                "PCA error: Invalid residual table name!")
+        _assert(not table_exists(residual_table, only_first_schema=True),
+                "PCA error: Residual table {0} already exists!".
+                format(residual_table))
+
     # Check that the row_id exists
     _assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
             "PCA error: {1} column does not exist in {0}!".
@@ -428,11 +414,9 @@ def _validate_args(schema_madlib,
         _assert(row_dim > 0 and col_dim > 0,
                 "PCA error: row_dim/col_dim should be positive integer")
 
-        validate_sparse(source_table,
-                        {'row': row_id, 'col': col_id, 'val': val_id},
-                        check_col=False)
 # ------------------------------------------------------------------------
 
+
 def pca_sparse_project(schema_madlib,
                        source_table,
                        pc_table,
@@ -523,6 +507,14 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
+    if is_sparse:
+        _validate_args(schema_madlib, source_table, pc_table, out_table,
+                   row_id, col_id, val_id, row_dim, col_dim, residual_table,
+                   result_summary_table)
+    else:
+        _validate_args(schema_madlib, source_table, pc_table, out_table,
+                   row_id, None, None, None, None,
+                   residual_table, result_summary_table)
     # If we add new columns to the pca_train output table in the future, they should
     # be included in this list:
     pc_table_model_cols = ['row_id', 'principal_components', 'std_dev', 'proportion']
@@ -530,7 +522,6 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
     grouping_cols = ''
     if grouping_cols_list:
         grouping_cols = ', '.join(grouping_cols_list)
-    _validate_args_output_table(out_table, residual_table, result_summary_table)
 
     other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
     grouping_cols_clause = ''
@@ -547,7 +538,7 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
         cols_names_types = get_cols_and_types(source_table)
         grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type
             for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
-    ## Create all output tables
+    # Create all output tables
     plpy.execute("""
             DROP TABLE IF EXISTS {0};
             CREATE TABLE {0} (
@@ -567,7 +558,7 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
                 ) """.format(result_summary_table, grouping_cols_clause))
     else:
         result_summary_table = ''
-    if residual_table and grouping_cols:
+    if residual_table:
         plpy.execute("""
             DROP TABLE IF EXISTS {0};
             CREATE TABLE {0} (
@@ -581,25 +572,17 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
     # declare variables whose values will be different for each group, if
     # grouping_cols is specified
     grouping_where_clause = ''
+    sparse_where_condition = ''
     select_grouping_cols = ''
-    temp_source_table_columns = ''
     grouping_cols_values = ''
     result_summary_table_temp = ''
     other_columns_in_pc_table = [col for col in get_cols(pc_table) if col not in grouping_cols_list]
     temp_pc_table_columns = ', '.join(other_columns_in_pc_table)
     original_row_id = row_id
+
     other_columns_in_table.remove(row_id)
-    temp_source_table_columns = """ ROW_NUMBER() OVER({partition_over}) AS row_id,
-                        """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)) +\
-                        ','.join(other_columns_in_table)
-    # We can now ignore the original row_id for all computations since we will
-    # create a new table with a row_id column that has perfect serially increasing
-    # row_id value. This is to support the scenario where users are not forced
-    # to have a row_id that follows a particular format. This restriction of having to
-    # provide a serially increasing row_id value starting from 1 becomes a pain
-    # point when grouping is used, since the row_id for each group will then have
-    # to start from 1.
-    row_id = 'row_id'
+    temp_source_table_columns = ','.join(other_columns_in_table)
+
     pca_union_call_list = []
     grp_id = 0
     if not is_sparse:
@@ -610,15 +593,17 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
     while True:
         if grouping_cols:
             grp_value_dict = distinct_grouping_values[grp_id]
-            grouping_where_clause = ' WHERE ' + ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            sparse_where_condition = ' AND ' + where_conditions
+            grouping_where_clause = ' WHERE ' + where_conditions
             select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
             grouping_cols_values = ', ' + ', '.join([str(value) for (key, value) in grp_value_dict.items()])
 
         pca_union_call_list.append("""
             {schema_madlib}._pca_project_union('{source_table}', '{pc_table}', '{out_table}',
                 '{row_id}', '{original_row_id}', '{grouping_cols}',
-                '{grouping_cols_clause}', '{residual_table}',
-                '{result_summary_table}', {grp_id}, '{grouping_where_clause}', '{select_grouping_cols}',
+                '{grouping_cols_clause}', '{residual_table}', '{result_summary_table}',
+                {grp_id}, '{grouping_where_clause}', '{sparse_where_condition}','{select_grouping_cols}',
                 '{grouping_cols_values}', '{temp_source_table_columns}', '{temp_pc_table_columns}',
                 {is_sparse}, '{col_id}', '{val_id}', {row_dim}, {col_dim})
             """.format(schema_madlib=schema_madlib,
@@ -630,6 +615,7 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
                 residual_table=residual_table,
                 result_summary_table=result_summary_table,
                 grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                sparse_where_condition=sparse_where_condition,
                 select_grouping_cols=select_grouping_cols,
                 grouping_cols_values=grouping_cols_values,
                 temp_source_table_columns=temp_source_table_columns,
@@ -641,31 +627,15 @@ def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
     # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
     # <query_i> in parallel.
     pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
-    try:
-        plpy.execute(pca_union_call)
-    except Exception as e:
-        ## drop the output tables that were created if PCA errored out.
-        plpy.execute("""
-            DROP TABLE IF EXISTS {0};
-            """.format(out_table))
-        if result_summary_table:
-            plpy.execute("""
-                    DROP TABLE IF EXISTS {0}
-                """.format(result_summary_table))
-        if residual_table:
-            plpy.execute("""
-                    DROP TABLE IF EXISTS {0}
-                """.format(residual_table))
-        plpy.error(str(e) + "\n" + str(e.args) + "\n" + str(e.strerror))
-        raise
-
+    plpy.execute(pca_union_call)
     plpy.execute("SET client_min_messages TO %s" % old_msg_level)
 
 def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
         row_id, original_row_id, grouping_cols, grouping_cols_clause,
         residual_table, result_summary_table, grp_id, grouping_where_clause,
-        select_grouping_cols, grouping_cols_values, temp_source_table_columns,
-        temp_pc_table_columns, is_sparse, col_id, val_id, row_dim, col_dim, **kwargs):
+        sparse_where_condition, select_grouping_cols, grouping_cols_values,
+        temp_source_table_columns, temp_pc_table_columns, is_sparse, col_id,
+        val_id, row_dim, col_dim, **kwargs):
     """
     The pca_project is performed over each group, if any.
 
@@ -696,23 +666,10 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
         None
     """
     out_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
-    ## Creation of this temp table is unnecessary if the scenario does not involve
-    ## grouping, and/or, the input table had perfect values for the row_id column.
-    ## This temp table will ensure pca works even when row_id of the source_table
-    ## does not have serially increasing numbers starting from 1;
-    source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
-    plpy.execute("""
-                CREATE TEMP TABLE {source_table_grouped} AS
-                SELECT {temp_source_table_columns}
-                FROM {source_table}
-                {grouping_where_clause}
-            """.format(source_table_grouped=source_table_grouped,
-                source_table=source_table, grouping_where_clause=grouping_where_clause,
-                temp_source_table_columns=temp_source_table_columns))
     if grouping_cols:
         pc_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
         plpy.execute("""
-                CREATE TEMP TABLE {pc_table_grouped} AS
+                CREATE TABLE {pc_table_grouped} AS
                 SELECT {temp_pc_table_columns}
                 FROM {pc_table}
                 {grouping_where_clause}
@@ -725,15 +682,15 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
     t0 = time.time()  # measure the starting time
     # Step 1: Validate the input arguments
     if is_sparse:
-        _validate_args(schema_madlib, source_table_grouped, pc_table, row_id,
-                       col_id, val_id, row_dim, col_dim)
         # Step 1.1: Create a copy of the sparse matrix and add row_dims and col_dims
         # Warning: This changes the column names of the table
         sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
-        create_temp_sparse_matrix_table_with_dims(source_table_grouped, sparse_table_copy,
+        create_temp_sparse_matrix_table_with_dims(source_table, sparse_table_copy,
                                                   row_id, col_id, val_id,
-                                                  row_dim, col_dim)
-
+                                                  row_dim, col_dim, sparse_where_condition)
+        validate_sparse(sparse_table_copy,
+                        {'row': row_id, 'col': col_id, 'val': val_id},
+                        check_col=False)
         # Step 1.2: Densify the input matrix
         x_dense = "pg_temp." + unique_string() + "_dense"
         plpy.execute("""
@@ -747,16 +704,29 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
             DROP TABLE IF EXISTS {0};
             """.format(sparse_table_copy))
         source_table_grouped = x_dense
-        row_id = 'row_id'
     else:
+        # For Dense matrix format only:
+        # We can now ignore the original row_id for all computations since we will
+        # create a new table with a row_id column that has no duplicates and ranges
+        # from 1 to number of rows in the group/table. This is to mainly support the
+        # grouping scenario where the row_id values might not range between 1 and
+        # number of rows in the group, for each group. Doing this also just extends
+        # this behavior for non-grouping scenarios too. If creating a new temp table
+        # that corrects the row_id column is not of much importance in non-grouping
+        # cases, we can avoid creating the temp table and save some computation time.
+        # But, at the moment, the code creates the temp table even for the non-grouping
+        # scenario.
+        # We don't need to do this for sparse representation because of the nature
+        # of its definition.
+
         # Preserve the mapping between new row_id created and the original row_id. This is
         # required only for dense input format.
         temp_row_id = original_row_id + unique_string()
         row_id_map_table = "rowid" + unique_string()
         plpy.execute("""
-                CREATE TEMP TABLE {row_id_map_table} AS
+                CREATE TABLE {row_id_map_table} AS
                 SELECT
-                    {original_row_id} AS {temp_row_id},
+                    {source_table}.{original_row_id} AS {temp_row_id},
                     {select_clause}
                 FROM {source_table}
                 {grouping_where_clause}
@@ -764,14 +734,31 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
                 original_row_id=original_row_id,
                 temp_row_id=temp_row_id,
                 source_table=source_table,
-                select_clause="""
-                    ROW_NUMBER() OVER({partition_over}) AS row_id
-                """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)),
+                select_clause=""" ROW_NUMBER() OVER() AS row_id """,
                 grouping_where_clause=grouping_where_clause))
-        ## Validate the arguments
-        _validate_args(schema_madlib, source_table_grouped, pc_table,
-                       row_id, None, None, None, None)
 
+        # Creation of this temp table is unnecessary if the scenario does not involve
+        # grouping, and/or, the input table had perfect values for the row_id column.
+        # This temp table will ensure pca works even when row_id of the source_table
+        # does not have serially increasing numbers starting from 1;
+        source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                    CREATE TABLE {source_table_grouped} AS
+                    SELECT {row_id_map_table}.row_id, {temp_source_table_columns}
+                    FROM
+                        (
+                            SELECT *
+                            FROM {source_table}
+                            {grouping_where_clause}
+                        ) t1
+                    INNER JOIN {row_id_map_table}
+                    ON {row_id_map_table}.{temp_row_id}=t1.{row_id}
+                """.format(source_table_grouped=source_table_grouped,
+                    temp_row_id=temp_row_id, row_id_map_table=row_id_map_table, row_id=row_id,
+                    source_table=source_table, grouping_where_clause=grouping_where_clause,
+                    temp_source_table_columns=temp_source_table_columns))
+
+    row_id = 'row_id'
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string()
     need_new_column_names = cast_dense_input_table_to_correct_columns(
@@ -779,7 +766,6 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
 
     if(need_new_column_names):
         source_table_grouped = source_table_copy
-
     [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
@@ -833,10 +819,7 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
     # Residual table: res = mat - proj
     create_residual_table = False
     if residual_table or result_summary_table:
-        if grouping_cols:
-            residual_table_grouped = "pg_temp." + unique_string() + "_temp_residual"
-        else:
-            residual_table_grouped = residual_table
+        residual_table_grouped = "pg_temp." + unique_string() + "_temp_residual"
         create_temp_residual_table = False
         if not residual_table:
             create_temp_residual_table = True
@@ -928,14 +911,13 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
     if is_sparse:
         ## We don't have to join based on row_id for sparse project.
         if create_residual_table:
-            if grouping_cols:
-                plpy.execute("""
-                        INSERT INTO {residual_table}
-                        SELECT * {select_grouping_cols}
-                        FROM {residual_table_grouped}
-                    """.format(residual_table=residual_table,
-                        select_grouping_cols=select_grouping_cols,
-                        residual_table_grouped=residual_table_grouped))
+            plpy.execute("""
+                    INSERT INTO {residual_table}
+                    SELECT * {select_grouping_cols}
+                    FROM {residual_table_grouped}
+                """.format(residual_table=residual_table,
+                    select_grouping_cols=select_grouping_cols,
+                    residual_table_grouped=residual_table_grouped))
         plpy.execute("""
                 INSERT INTO {out_table}
                 SELECT * {select_grouping_cols}
@@ -954,21 +936,16 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
                 out_table_cols=', '.join(output_table_cols),
                 select_grouping_cols=select_grouping_cols)
         if create_residual_table:
-            if grouping_cols:
-                plpy.execute("""
-                    INSERT INTO {residual_table}
-                    SELECT {select_clause}
-                    FROM {residual_table_grouped}
-                    INNER JOIN {row_id_map_table}
-                    ON {row_id_map_table}.row_id={residual_table_grouped}.row_id
-                    """.format(residual_table=residual_table,
-                        select_clause=output_table_select_clause,
-                        residual_table_grouped=residual_table_grouped,
-                        row_id_map_table=row_id_map_table))
-                plpy.execute("""
-                        DROP TABLE IF EXISTS {0}
-                    """.format(residual_table_grouped))
-
+            plpy.execute("""
+                INSERT INTO {residual_table}
+                SELECT {select_clause}
+                FROM {residual_table_grouped}
+                INNER JOIN {row_id_map_table}
+                ON {row_id_map_table}.row_id={residual_table_grouped}.row_id
+                """.format(residual_table=residual_table,
+                    select_clause=output_table_select_clause,
+                    residual_table_grouped=residual_table_grouped,
+                    row_id_map_table=row_id_map_table))
         plpy.execute("""
                     INSERT INTO {out_table}
                     SELECT {select_clause}
@@ -982,6 +959,10 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
         plpy.execute("""
                 DROP TABLE IF EXISTS {0};
             """.format(row_id_map_table))
+    if residual_table or result_summary_table:
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0}
+            """.format(residual_table_grouped))
     plpy.execute("""
             DROP TABLE IF EXISTS {0};
             DROP TABLE IF EXISTS {1};

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/pca/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in
index 814292c..0a0a69c 100644
--- a/src/ports/postgres/modules/pca/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/pca_project.sql_in
@@ -682,6 +682,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_project_union(
     result_summary_table        TEXT,    -- Table name to store summary of results (Default: NULL)
     grp_id                      INTEGER, -- a place holder id for each group
     grouping_where_clause       TEXT,    -- WHERE clause using grouping_cols
+    sparse_where_condition      TEXT,    -- ' AND '-prefixed grouping condition used when creating temp sparse matrix table with dims
     select_grouping_cols        TEXT,    -- SELECT clause using grouping_cols
     grouping_cols_values        TEXT,    -- distinct values of the grouping_cols
     temp_source_table_columns   TEXT,    -- SELECT caluse for creating temporary copy of the source_table

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/edb69dd4/src/ports/postgres/modules/pca/test/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca.sql_in b/src/ports/postgres/modules/pca/test/pca.sql_in
index 20093ac..12d8ab1 100644
--- a/src/ports/postgres/modules/pca/test/pca.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca.sql_in
@@ -308,48 +308,46 @@ select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
 select * from result_table_214712398172490837;
 select * from result_table_214712398172490838;
 
+-------------------------------------------------------------------------
+-- test a different column name
+alter table sparse_mat rename column row_id to rownr;
+alter table sparse_mat rename column col_id to colnr;
+alter table sparse_mat rename column val_id to vals;
+
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'rownr', 'colnr', 'vals', 10, 10, 10);
+select * from result_table_214712398172490837;
+
 -- Sparse input data with grouping column
-DROP TABLE IF EXISTS sparse_mat;
-CREATE TABLE sparse_mat (
+DROP TABLE IF EXISTS sparse_mat_grp;
+CREATE TABLE sparse_mat_grp (
     id integer,
     col_id integer,
     val_id integer,
     grp    integer
 );
-COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+COPY sparse_mat_grp (id, col_id, val_id, grp) FROM stdin delimiter '|';
 1|2|4|1
 1|5|6|1
 3|8|4|1
 5|4|2|1
-6|6|12|2
-8|7|2|2
-8|1|2|2
-9|8|2|2
-9|3|4|2
+1|2|4|2
+1|5|6|2
+3|8|4|2
+5|4|2|2
 \.
 -- Learn individaul PCA models based on grouping column (grp)
 drop table if exists result_table_214712398172490837;
 drop table if exists result_table_214712398172490837_mean;
 drop table if exists result_table_214712398172490838;
-select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+select pca_sparse_train('sparse_mat_grp', 'result_table_214712398172490837',
 'id', 'col_id', 'val_id', 10, 10, 0.8, 'grp', 0, FALSE, 'result_table_214712398172490838');
 select * from result_table_214712398172490837;
 select * from result_table_214712398172490838;
 
 -------------------------------------------------------------------------
--- test a different column name
-alter table sparse_mat rename column id to rownr;
-alter table sparse_mat rename column col_id to colnr;
-alter table sparse_mat rename column val_id to vals;
-
-drop table if exists result_table_214712398172490837;
-drop table if exists result_table_214712398172490837_mean;
-select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
-'rownr', 'colnr', 'vals', 10, 10, 10);
-select * from result_table_214712398172490837;
-
-
--------------------------------------------------------------------------
 drop table if exists mat;
 -- Check the second input matrix format produces the same results as the first format
 CREATE TABLE mat (

[5/6] incubator-madlib git commit: Handle error condition with international character

Posted by nj...@apache.org.

Handle error condition with international character

One of the temp table names was using the source_table name
appended with the unique_string() output. This could cause
issues if international characters were used in the input table
name, so user-supplied names are no longer part of the temp names.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/7e577458
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/7e577458
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/7e577458

Branch: refs/heads/master
Commit: 7e57745860e65d6e86437346e0f33d3d74566704
Parents: 4e1e96a
Author: Nandish Jayaram <nj...@users.noreply.github.com>
Authored: Tue Jan 10 16:35:35 2017 -0800
Committer: Nandish Jayaram <nj...@users.noreply.github.com>
Committed: Wed Jan 18 14:05:09 2017 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/pca/pca.py_in         | 2 +-
 src/ports/postgres/modules/pca/pca_project.py_in | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/7e577458/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in b/src/ports/postgres/modules/pca/pca.py_in
index b6609f2..5b660a8 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -199,7 +199,7 @@ def pca_wrap(schema_madlib, source_table, pc_table, row_id,
         col_dim = 0
     while True:
         if result_summary_table:
-            result_summary_table_temp = "pg_temp." + result_summary_table + unique_string() + "_" + str(grp_id)
+            result_summary_table_temp = "pg_temp." + unique_string() + "_" + str(grp_id)
         if grouping_cols:
             grp_value_dict = distinct_grouping_values[grp_id]
             where_conditions = ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/7e577458/src/ports/postgres/modules/pca/pca_project.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.py_in b/src/ports/postgres/modules/pca/pca_project.py_in
index 52d14ec..62bf2b1 100644
--- a/src/ports/postgres/modules/pca/pca_project.py_in
+++ b/src/ports/postgres/modules/pca/pca_project.py_in
@@ -721,7 +721,7 @@ def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
 
         # Preserve the mapping between new row_id created and the original row_id. This is
         # required only for dense input format.
-        temp_row_id = original_row_id + unique_string()
+        temp_row_id = "original_row_id" + unique_string()
         row_id_map_table = "rowid" + unique_string()
         plpy.execute("""
                 CREATE TABLE {row_id_map_table} AS

[3/6] incubator-madlib git commit: PCA: Add grouping support to PCA

Posted by nj...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca_project.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.py_in b/src/ports/postgres/modules/pca/pca_project.py_in
index 98d9e3f..1e61d3c 100644
--- a/src/ports/postgres/modules/pca/pca_project.py_in
+++ b/src/ports/postgres/modules/pca/pca_project.py_in
@@ -19,6 +19,8 @@ from utilities.utilities import _array_to_string
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 from utilities.utilities import add_postfix
+from utilities.validate_args import get_cols, get_cols_and_types
+from utilities.control import MinWarning
 
 
 version_wrapper = __mad_version()
@@ -27,69 +29,149 @@ array_to_string = version_wrapper.select_vec_return()
 ZERO_THRESHOLD = 1e-6
 
 
-# Dense PCA help function
+# Dense PCA project help function
 def pca_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
-    if usage_string is None:
-        usage_string = ''
-
-    if (usage_string.lower() == "usage"):
+    if usage_string is not None and \
+            usage_string.lower() in ("usage", "help", "?"):
         return """
-        ----------------------------------------------------------------
-                                Usage
-        ----------------------------------------------------------------
-        SELECT {schema_madlib}.pca_project (
-            'tbl_source',          -- Data table
-            'tbl_pc',              -- Table with principal componenents
-                                        (obtained as output from pca_train)
-            'tbl_result',          -- Result table
-            'row_id',              -- Name of the column containing the row_id
-            -- Optional Parameters
-            ----------------------------------------------------------------
-            'tbl_residual',        -- Residual table (Default: NULL)
-            'tbl_result_summary',  -- Result summary table (Default : NULL)
-        );
-
-                                Output Tables
-        --------------------------------------------------------------------
-        The output is divided into three tables (two of which are optional)
-
-        --------------------------------------------------------------------
-         The output table ('tbl_result' above) encodes a dense matrix
-         with the projection onto the principal components. The matrix contains
-         the following columns:
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        --------------------------------------------------------------------
-         The residual table ('tbl_residual' above) encodes a dense residual
-         matrix which has the following columns
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        --------------------------------------------------------------------
-         The result summary table ('tbl_result_summary' above) has the following columns
-
-          'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
-          'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
-          'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
-        ----------------------------------------------------------------
+----------------------------------------------------------------
+                        Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_project (
+    'tbl_source',          -- Data table
+    'pc_table',            -- Table with principal componenents
+                                (obtained as output from pca_train)
+    'tbl_result',          -- Result table
+    'row_id',              -- Name of the column containing the row_id
+    -- Optional Parameters
+    ----------------------------------------------------------------
+    'tbl_residual',        -- Residual table (Default: NULL)
+    'tbl_result_summary',  -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in pc_table were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+                        Output Tables
+--------------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+--------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+  'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
+  'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
+  'relative_residual_norm'   DOUBLE PRECISION,   -- Relative error of the residuals
+  grouping_col                                   -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
-        ----------------------------------------------------------------
-                 Summary: PCA Projection
-        ----------------------------------------------------------------
-        PCA Projection: Projects a dataset to an already trained
-        space of principal components.
-        --
-        For function usage information, run
-        SELECT {schema_madlib}.pca_project('usage');
-        --
+        if usage_string is not None and \
+                usage_string.lower() in ("example", "examples"):
+            return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
+-- Run pca_project() using a model table generated without grouping_cols.
+-- Create input table for pca_project()
+
+DROP TABLE IF EXISTS mat_proj;
+CREATE TABLE mat_proj (
+    row_id integer,
+    row_vec double precision[]
+);
+COPY mat_proj (row_id, row_vec) FROM stdin DELIMITER '|';
+1|{{1,2,3}}
+2|{{2,1,2}}
+3|{{3,2,1}}
+11|{{1,2,3}}
+21|{{2,1,2}}
+31|{{3,2,1}}
+41|{{1,2,4}}
+12|{{1,3,3}}
+\.
+
+-- NOTE: Use the 'result_table' created using the example shown in
+-- {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter here.
+
+DROP TABLE IF EXISTS mat_proj_out;
+SELECT {schema_madlib}.pca_project(
+    'mat_proj',
+    'result_table',
+    'mat_proj_out',
+    'row_id'
+    );
+
+SELECT * FROM mat_proj_out;
+
+-----------------------------------------------------------------------
+
+-- Run pca_project() using a model table generated with grouping_cols.
+-- Create input table for pca_project(), with grouping
+
+DROP TABLE IF EXISTS mat_proj_grouped;
+CREATE TABLE mat_proj_grouped (
+    row_id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+COPY mat_proj_grouped (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{{1,2,3}}|1
+2|{{2,1,2}}|1
+3|{{3,2,1}}|1
+4|{{1,2,3,4,5}}|2
+5|{{2,1,2,4,5}}|2
+6|{{3,2,1,4,5}}|2
+\.
+
+-- NOTE: Use the 'result_table_grp' created using the example shown
+-- in {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_grp' was created with 'matrix_id' as the
+-- grouping column, and the table 'mat_proj_grouped' should also have the
+-- 'matrix_id' column in it.
+
+DROP TABLE IF EXISTS mat_proj_grouped_out;
+SELECT {schema_madlib}.pca_project(
+    'mat_proj_grouped',
+    'result_table_grp',
+    'mat_proj_grouped_out',
+    'row_id'
+    );
+
+SELECT * FROM mat_proj_grouped_out;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+----------------------------------------------------------------
+         Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run
+SELECT {schema_madlib}.pca_project('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
 
@@ -99,84 +181,204 @@ def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
     """
     Given a usage string, give out function usage information.
     """
-    if usage_string is None:
-        usage_string = ''
-
-    if (usage_string.lower() == "usage"):
+    if usage_string is not None and \
+            usage_string.lower() in ("usage", "help", "?"):
         return """
-        ----------------------------------------------------------------
-                                Usage
-        ----------------------------------------------------------------
-        SELECT {schema_madlib}.pca_sparse_project (
-            'tbl_source',          -- Data table
-            'tbl_pc',              -- Table with principal componenents
-                                        (obtained as output from pca_train)
-            'tbl_result',          -- Result table
-            'row_id',              -- Name of the column containing the row_id
-            'col_id',              -- Name of the column containing the col_id
-            'val_id',              -- Name of the column containing the val_id
-            'row_dim'              -- Row dimension of the sparse matrix
-            'col_dim'              -- Column dimension of the sparse matrix
-            -- Optional Parameters
-            ----------------------------------------------------------------
-            'tbl_residual',        -- Residual table (Default: NULL)
-            'tbl_result_summary',  -- Result summary table (Default : NULL)
-        );
-
-                                Output Tables
-        ----------------------------------------------------------------
-        The output is divided into three tables (two of which are optional)
-
-        -----------------------------------------------------------------------------------------
-         The output table ('tbl_result' above) encodes a dense matrix
-         with the projection onto the principal components. The matrix contains
-         the following columns:
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        -----------------------------------------------------------------------------------------
-         The residual table ('tbl_residual' above) encodes a dense residual
-         matrix which has the following columns
-
-         'row_id'        INTEGER,            -- Row id of the output matrix
-         'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
-        -----------------------------------------------------------------------------------------
-         The result summary table ('tbl_result_summary' above) has the following columns
-
-          'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
-          'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
-          'relative_residual_norm'   DOUBLE PRECISION    -- Relative error of the residuals
-        ----------------------------------------------------------------
+----------------------------------------------------------------
+                        Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_sparse_project (
+    'tbl_source',          -- Data table
+    'pc_table',            -- Table with principal components
+                                (obtained as output from pca_train)
+    'tbl_result',          -- Result table
+    'row_id',              -- Name of the column containing the row_id
+    'col_id',              -- Name of the column containing the col_id
+    'val_id',              -- Name of the column containing the val_id
+    'row_dim',             -- Row dimension of the sparse matrix
+    'col_dim',             -- Column dimension of the sparse matrix
+    -- Optional Parameters
+    ----------------------------------------------------------------
+    'tbl_residual',        -- Residual table (Default: NULL)
+    'tbl_result_summary'   -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in 'pc_table' were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+                        Output Tables
+----------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+-----------------------------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id'        INTEGER,            -- Row id of the output matrix
+ 'row_vec'       DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col                        -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+  'exec_time'                INTEGER,            -- Wall clock time (ms) of the function.
+  'residual_norm'            DOUBLE PRECISION,   -- Absolute error of the residuals
+  'relative_residual_norm'   DOUBLE PRECISION,   -- Relative error of the residuals
+  grouping_col                                   -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
         """.format(schema_madlib=schema_madlib)
     else:
-        return """
-        ----------------------------------------------------------------
-                 Summary: PCA Projection
-        ----------------------------------------------------------------
-        PCA Projection: Projects a dataset to an already trained
-        space of principal components.
-        --
-        For function usage information, run
-        SELECT {schema_madlib}.pca_sparse_project('usage');
-        --
+        if usage_string is not None and \
+                usage_string.lower() in ("example", "examples"):
+            return """
+----------------------------------------------------------------
+                        Examples
+----------------------------------------------------------------
+-- Run pca_sparse_project() using a model table generated without grouping_cols.
+-- Create input table for pca_sparse_project()
+
+DROP TABLE IF EXISTS sparse_proj_mat;
+CREATE TABLE sparse_proj_mat (
+    row_id integer,
+    col_id integer,
+    val_id integer
+);
+COPY sparse_proj_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
+1|2|4
+1|5|6
+3|8|4
+8|1|2
+8|7|2
+9|3|4
+9|8|2
+\.
+
+-- NOTE: Use the 'result_table_sparse' created using the example shown in
+-- {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter here.
+
+SELECT {schema_madlib}.pca_sparse_project(
+    'sparse_proj_mat',
+    'result_table_sparse',
+    'sparse_proj_mat_out',
+    'row_id',
+    'col_id',
+    'val_id',
+    10,
+    10
+    );
+
+SELECT * FROM sparse_proj_mat_out;
+
+
+-- Run pca_sparse_project() using a model table generated with grouping_cols.
+-- Create input table for pca_sparse_project(), with grouping
+
+DROP TABLE IF EXISTS sparse_proj_mat_with_grouping;
+CREATE TABLE sparse_proj_mat_with_grouping (
+    row_id integer,
+    col_id integer,
+    val_id integer,
+    matrix_id integer
+);
+COPY sparse_proj_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+\.
+
+-- NOTE: Use the 'result_table_sparsed_grouped' created using the example shown
+-- in {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_sparsed_grouped' was created with 'matrix_id' as the
+-- grouping column, and the table 'sparse_proj_mat_with_grouping' should also have
+-- the 'matrix_id' column in it.
+
+SELECT {schema_madlib}.pca_sparse_project(
+    'sparse_proj_mat_with_grouping',
+    'result_table_sparsed_grouped',
+    'sparse_proj_mat_with_grouping_out',
+    'row_id',
+    'col_id',
+    'val_id',
+    10,
+    10
+    );
+
+SELECT * FROM sparse_proj_mat_with_grouping_out;
+            """.format(schema_madlib=schema_madlib)
+        else:
+            return """
+----------------------------------------------------------------
+         Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run:
+SELECT {schema_madlib}.pca_sparse_project('usage');
+--
         """.format(schema_madlib=schema_madlib)
 
 
+def _validate_args_output_table(out_table, residual_table=None,
+                   result_summary_table=None):
+    """
+    Validates the output table name arguments of the PCA project functions
+
+    Args:
+        @param out_table  Name of output table to store projection result (required)
+        @param residual_table  Name of the residual table (checked only if not None)
+        @param result_summary_table  Name of result summary table (checked only if not None)
+    Returns:
+        None
+    Throws:
+        plpy.error if any argument is invalid
+    """
+    # The output table name is mandatory: it must be non-empty/non-blank
+    # and must not refer to a table that already exists
+    _assert(out_table and out_table.strip(),
+            "PCA error: Invalid output table name.")
+    _assert(not table_exists(out_table, only_first_schema=True),
+            "PCA error: Output table {0} already exists!".format(str(out_table)))
+
+    # If given, the result summary table name must be non-blank and not exist yet
+    if result_summary_table is not None:
+        _assert(result_summary_table.strip(),
+                "PCA error: Invalid result summary table name!")
+        _assert(not table_exists(result_summary_table, only_first_schema=True),
+                "PCA error: Result summary table {0} already exists!".
+                format(result_summary_table))
+
+    # If given, the residual table name must be non-blank and not exist yet
+    if residual_table is not None:
+        _assert(residual_table.strip(),
+                "PCA error: Invalid residual table name!")
+        _assert(not table_exists(residual_table, only_first_schema=True),
+                "PCA error: Residual table {0} already exists!".
+                format(residual_table))
+
 # Validate arguments: Same as pca
 # ------------------------------------------------------------------------
 def _validate_args(schema_madlib,
                    source_table,
                    pc_table,
-                   out_table,
                    row_id,
                    col_id=None,
                    val_id=None,
                    row_dim=None,
-                   col_dim=None,
-                   residual_table=None,
-                   result_summary_table=None):
+                   col_dim=None):
     """
     Validates all arguments passed to the PCA function
 
@@ -206,29 +408,6 @@ def _validate_args(schema_madlib,
     _assert(table_exists(add_postfix(pc_table, "_mean")),
             "PCA error: Source data table column means does not exist!")
 
-    # Make sure that the output table does not exist
-    # Also check that the output table is not null
-    _assert(out_table and out_table.strip(),
-            "PCA error: Invalid output table name.")
-    _assert(not table_exists(out_table, only_first_schema=True),
-            "PCA error: Output table {0} already exists!".format(str(out_table)))
-
-    # Check that the result summary table is not empty
-    if result_summary_table is not None:
-        _assert(result_summary_table.strip(),
-                "PCA error: Invalid result summary table name!")
-        _assert(not table_exists(result_summary_table, only_first_schema=True),
-                "PCA error: Result summary table {0} already exists!".
-                format(result_summary_table))
-
-    # Check that the result summary table is not empty
-    if residual_table is not None:
-        _assert(residual_table.strip(),
-                "PCA error: Invalid residual table name!")
-        _assert(not table_exists(residual_table, only_first_schema=True),
-                "PCA error: Residual table {0} already exists!".
-                format(residual_table))
-
     # Check that the row_id exists
     _assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
             "PCA error: {1} column does not exist in {0}!".
@@ -254,7 +433,6 @@ def _validate_args(schema_madlib,
                         check_col=False)
 # ------------------------------------------------------------------------
 
-
 def pca_sparse_project(schema_madlib,
                        source_table,
                        pc_table,
@@ -290,65 +468,9 @@ def pca_sparse_project(schema_madlib,
     Throws:
         plpy.error if any argument is invalid
     """
-
-    # Reset the message level to avoid random messages
-    old_msg_level = plpy.execute("""
-                                  SELECT setting
-                                  FROM pg_settings
-                                  WHERE name='client_min_messages'
-                                  """)[0]['setting']
-    plpy.execute('SET client_min_messages TO warning')
-
-    # Step 1: Validate the input arguments
-    _validate_args(schema_madlib,
-                   source_table,
-                   pc_table,
-                   out_table,
-                   row_id,
-                   col_id,
-                   val_id,
-                   row_dim,
-                   col_dim,
-                   residual_table,
-                   result_summary_table)
-
-    # Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
-    # Warning: This changes the column names of the table
-    sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
-    create_temp_sparse_matrix_table_with_dims(source_table,
-                                              sparse_table_copy,
-                                              row_id,
-                                              col_id,
-                                              val_id,
-                                              row_dim,
-                                              col_dim)
-
-    # Step 3: Densify the input matrix
-    x_dense = "pg_temp." + unique_string() + "_dense"
-    plpy.execute("""
-        SELECT {schema_madlib}.matrix_densify(
-            '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
-            '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
-        """.format(**locals()))
-
-    # Step 4: Pass the densified matrix to regular PCA
-    pca_project(schema_madlib,
-                x_dense,
-                pc_table,
-                out_table,
-                'row_id',
-                residual_table,
-                result_summary_table)
-
-    # Step 4: Clean up
-    plpy.execute(
-        """
-        DROP TABLE IF EXISTS {x_dense};
-        DROP TABLE IF EXISTS {sparse_table_copy};
-        """.format(x_dense=x_dense,
-                   sparse_table_copy=sparse_table_copy))
-
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+    pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table, result_summary_table,
+                       True, col_id, val_id, row_dim, col_dim)
 
 
 # ------------------------------------------------------------------------
@@ -379,8 +501,21 @@ def pca_project(schema_madlib,
     Throws:
         plpy.error if any argument is invalid
     """
-    t0 = time.time()  # measure the starting time
+    pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table, result_summary_table)
 
+
+def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+                       row_id, residual_table,
+                       result_summary_table, is_sparse=False,
+                       col_id=None, val_id=None, row_dim=None,
+                       col_dim=None, **kwargs):
+    """
+    Shared driver for dense and sparse PCA projection, with grouping support.
+    Grouping columns are inferred from pc_table; the output tables are created
+    here, and then a single query is run that includes a separate
+    "madlib._pca_project_union(...)" call for each group (one call if ungrouped).
+    """
     # Reset the message level to avoid random messages
     old_msg_level = plpy.execute("""
                                   SELECT setting
@@ -388,24 +523,267 @@ def pca_project(schema_madlib,
                                   WHERE name='client_min_messages'
                                   """)[0]['setting']
     plpy.execute('SET client_min_messages TO warning')
+    # If we add new columns to the pca_train output table in the future, they should
+    # be included in this list:
+    pc_table_model_cols = ['row_id', 'principal_components', 'std_dev', 'proportion']
+    grouping_cols_list = [col for col in get_cols(pc_table) if col not in pc_table_model_cols]
+    grouping_cols = ''
+    if grouping_cols_list:
+        grouping_cols = ', '.join(grouping_cols_list)
+    _validate_args_output_table(out_table, residual_table, result_summary_table)
+
+    other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
+    grouping_cols_clause = ''
+    if(grouping_cols):
+        # validate the grouping columns. We currently only support grouping_cols
+        # to be column names in the source_table, and not expressions!
+        _assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
+                """PCA error: One or more grouping columns in {0} do not exist in {1}, but
+                the model in {2} was learnt with grouping!""".format(grouping_cols,
+                    source_table, pc_table))
+        distinct_grouping_values = plpy.execute("""
+                SELECT DISTINCT {grouping_cols} FROM {source_table}
+            """.format(grouping_cols=grouping_cols, source_table=source_table))
+        cols_names_types = get_cols_and_types(source_table)
+        grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type
+            for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
+    ## Create all output tables
+    plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            CREATE TABLE {0} (
+                row_id      INTEGER,
+                row_vec     double precision[]
+                {1}
+            ) """.format(out_table, grouping_cols_clause))
+    if result_summary_table:
+        plpy.execute(
+                """
+                DROP TABLE IF EXISTS {0};
+                CREATE TABLE {0} (
+                    exec_time               FLOAT8,
+                    residual_norm           FLOAT8,
+                    relative_residual_norm  FLOAT8
+                    {1}
+                ) """.format(result_summary_table, grouping_cols_clause))
+    else:
+        result_summary_table = ''
+    if residual_table and grouping_cols:
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            CREATE TABLE {0} (
+                row_id      INTEGER,
+                row_vec     double precision[]
+                {1}
+            ) """.format(residual_table, grouping_cols_clause))
+    if not residual_table:
+        residual_table = ''
+
+    # declare variables whose values will be different for each group, if
+    # grouping_cols is specified
+    grouping_where_clause = ''
+    select_grouping_cols = ''
+    temp_source_table_columns = ''
+    grouping_cols_values = ''
+    result_summary_table_temp = ''
+    other_columns_in_pc_table = [col for col in get_cols(pc_table) if col not in grouping_cols_list]
+    temp_pc_table_columns = ', '.join(other_columns_in_pc_table)
+    original_row_id = row_id
+    other_columns_in_table.remove(row_id)
+    temp_source_table_columns = """ ROW_NUMBER() OVER({partition_over}) AS row_id,
+                        """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)) +\
+                        ','.join(other_columns_in_table)
+    # We can now ignore the original row_id for all computations since we will
+    # create a new table with a row_id column that has perfect serially increasing
+    # row_id value. This is to support the scenario where users are not forced
+    # to have a row_id that follows a particular format. This restriction of having to
+    # provide a serially increasing row_id value starting from 1 becomes a pain
+    # point when grouping is used, since the row_id for each group will then have
+    # to start from 1.
+    row_id = 'row_id'
+    pca_union_call_list = []
+    grp_id = 0
+    if not is_sparse:
+        col_id = 'NULL'
+        val_id = 'NULL'
+        row_dim = 0
+        col_dim = 0
+    while True:
+        if grouping_cols:
+            grp_value_dict = distinct_grouping_values[grp_id]
+            grouping_where_clause = ' WHERE ' + ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+            select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
+            grouping_cols_values = ', ' + ', '.join([str(value) for (key, value) in grp_value_dict.items()])
+
+        pca_union_call_list.append("""
+            {schema_madlib}._pca_project_union('{source_table}', '{pc_table}', '{out_table}',
+                '{row_id}', '{original_row_id}', '{grouping_cols}',
+                '{grouping_cols_clause}', '{residual_table}',
+                '{result_summary_table}', {grp_id}, '{grouping_where_clause}', '{select_grouping_cols}',
+                '{grouping_cols_values}', '{temp_source_table_columns}', '{temp_pc_table_columns}',
+                {is_sparse}, '{col_id}', '{val_id}', {row_dim}, {col_dim})
+            """.format(schema_madlib=schema_madlib,
+                source_table=source_table, pc_table=pc_table,
+                out_table=out_table, row_id=row_id,
+                original_row_id=original_row_id,
+                grouping_cols=grouping_cols,
+                grouping_cols_clause=grouping_cols_clause,
+                residual_table=residual_table,
+                result_summary_table=result_summary_table,
+                grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+                select_grouping_cols=select_grouping_cols,
+                grouping_cols_values=grouping_cols_values,
+                temp_source_table_columns=temp_source_table_columns,
+                temp_pc_table_columns=temp_pc_table_columns, is_sparse=is_sparse,
+                col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
+        grp_id += 1
+        if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
+            break
+    # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
+    # <query_i> in parallel.
+    pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
+    try:
+        plpy.execute(pca_union_call)
+    except Exception as e:
+        ## Drop the output tables created above; 'e' may lack 'strerror' (see getattr).
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            """.format(out_table))
+        if result_summary_table:
+            plpy.execute("""
+                    DROP TABLE IF EXISTS {0}
+                """.format(result_summary_table))
+        if residual_table:
+            plpy.execute("""
+                    DROP TABLE IF EXISTS {0}
+                """.format(residual_table))
+        plpy.error(str(e) + "\n" + str(e.args) + "\n" + str(getattr(e, 'strerror', '')))
+        raise
+
+    plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
+        row_id, original_row_id, grouping_cols, grouping_cols_clause,
+        residual_table, result_summary_table, grp_id, grouping_where_clause,
+        select_grouping_cols, grouping_cols_values, temp_source_table_columns,
+        temp_pc_table_columns, is_sparse, col_id, val_id, row_dim, col_dim, **kwargs):
+    """
+    The pca_project is performed over each group, if any.
+
+    Args:
+        @param schema_madlib -- madlib schema name
+        @param source_table -- Source table name (dense matrix)
+        @param pc_table -- Name of the table containing the principal components
+        @param out_table -- Output table name
+        @param row_id -- Column name for the ID for each row
+        @param original_row_id  -- copy of the row_id originally passed
+        @param grouping_cols -- Comma-separated list of grouping columns (Default: NULL)
+        @param grouping_cols_clause -- Part of the SQL query to be used with grouping_cols
+        @param residual_table -- Residual table name
+        @param result_summary_table -- Table name to store summary of results (Default: NULL)
+        @param grp_id -- a place holder id for each group
+        @param grouping_where_clause -- WHERE clause using grouping_cols
+        @param select_grouping_cols -- SELECT clause using grouping_cols
+        @param grouping_cols_values -- distinct values of the grouping_cols
+        @param temp_source_table_columns -- SELECT clause for creating temporary copy of the source_table
+        @param temp_pc_table_columns -- non grouping_cols of the pc_table
+        @param is_sparse -- specifies if the PCA call is for sparse or dense matrices
+        @param col_id -- sparse representation based detail
+        @param val_id -- sparse representation based detail
+        @param row_dim -- sparse representation based detail
+        @param col_dim -- sparse representation based detail
+
+    Returns:
+        None
+    """
+    out_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+    ## Creation of this temp table is unnecessary if the scenario does not involve
+    ## grouping, and/or, the input table had perfect values for the row_id column.
+    ## This temp table will ensure pca works even when row_id of the source_table
+    ## does not have serially increasing numbers starting from 1;
+    source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+    plpy.execute("""
+                CREATE TEMP TABLE {source_table_grouped} AS
+                SELECT {temp_source_table_columns}
+                FROM {source_table}
+                {grouping_where_clause}
+            """.format(source_table_grouped=source_table_grouped,
+                source_table=source_table, grouping_where_clause=grouping_where_clause,
+                temp_source_table_columns=temp_source_table_columns))
+    if grouping_cols:
+        pc_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+        plpy.execute("""
+                CREATE TEMP TABLE {pc_table_grouped} AS
+                SELECT {temp_pc_table_columns}
+                FROM {pc_table}
+                {grouping_where_clause}
+            """.format(pc_table_grouped=pc_table_grouped,
+                pc_table=pc_table, grouping_where_clause=grouping_where_clause,
+                temp_pc_table_columns=temp_pc_table_columns))
+    else:
+        pc_table_grouped = pc_table
 
+    t0 = time.time()  # measure the starting time
     # Step 1: Validate the input arguments
-    _validate_args(schema_madlib, source_table, pc_table, out_table,
-                   row_id, None, None, None, None,
-                   residual_table, result_summary_table)
+    if is_sparse:
+        _validate_args(schema_madlib, source_table_grouped, pc_table, row_id,
+                       col_id, val_id, row_dim, col_dim)
+        # Step 1.1: Create a copy of the sparse matrix and add row_dims and col_dims
+        # Warning: This changes the column names of the table
+        sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
+        create_temp_sparse_matrix_table_with_dims(source_table_grouped, sparse_table_copy,
+                                                  row_id, col_id, val_id,
+                                                  row_dim, col_dim)
+
+        # Step 1.2: Densify the input matrix
+        x_dense = "pg_temp." + unique_string() + "_dense"
+        plpy.execute("""
+            SELECT {schema_madlib}.matrix_densify(
+                '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
+                '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
+            """.format(schema_madlib=schema_madlib,
+                sparse_table_copy=sparse_table_copy, row_id=row_id,
+                col_id=col_id, val_id=val_id, x_dense=x_dense))
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            """.format(sparse_table_copy))
+        source_table_grouped = x_dense
+        row_id = 'row_id'
+    else:
+        # Preserve the mapping between new row_id created and the original row_id. This is
+        # required only for dense input format.
+        temp_row_id = original_row_id + unique_string()
+        row_id_map_table = "rowid" + unique_string()
+        plpy.execute("""
+                CREATE TEMP TABLE {row_id_map_table} AS
+                SELECT
+                    {original_row_id} AS {temp_row_id},
+                    {select_clause}
+                FROM {source_table}
+                {grouping_where_clause}
+            """.format(row_id_map_table=row_id_map_table,
+                original_row_id=original_row_id,
+                temp_row_id=temp_row_id,
+                source_table=source_table,
+                select_clause="""
+                    ROW_NUMBER() OVER({partition_over}) AS row_id
+                """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)),
+                grouping_where_clause=grouping_where_clause))
+        ## Validate the arguments
+        _validate_args(schema_madlib, source_table_grouped, pc_table,
+                       row_id, None, None, None, None)
 
     # Make sure that the table has row_id and row_vec
     source_table_copy = "pg_temp." + unique_string()
     need_new_column_names = cast_dense_input_table_to_correct_columns(
-        schema_madlib, source_table, source_table_copy, row_id)
+        schema_madlib, source_table_grouped, source_table_copy, row_id)
 
     if(need_new_column_names):
-        source_table = source_table_copy
+        source_table_grouped = source_table_copy
 
-    [row_dim, col_dim] = get_dims(source_table,
+    [row_dim, col_dim] = get_dims(source_table_grouped,
                                   {'row': 'row_id', 'col': 'col_id',
                                    'val': 'row_vec'})
-    validate_dense(source_table,
+    validate_dense(source_table_grouped,
                    {'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
                    check_col=False, row_dim=row_dim)
 
@@ -426,49 +804,57 @@ def pca_project(schema_madlib,
                 row_id,
                 ({schema_madlib}.utils_normalize_data(
                                   row_vec,
-                                  (select column_mean from {pc_table_mean}),
+                                  (select column_mean from {pc_table_mean}
+                                  {grouping_where_clause}),
                                   '{x_std_str}'::double precision[]))
                     AS row_vec
-            FROM {source_table}
+            FROM {source_table_grouped}
         """.format(schema_madlib=schema_madlib,
                    pc_table_mean=pc_table_mean,
-                   source_table=source_table,
+                   source_table_grouped=source_table_grouped,
                    scaled_source_table=scaled_source_table,
+                   grouping_where_clause=grouping_where_clause,
                    x_std_str=x_std_str))
 
     plpy.execute(
         """
         SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
                                             'trans=false,row=row_id, col=col_id, val=row_vec',
-                                           '{pc_table}',
+                                           '{pc_table_grouped}',
                                             'trans=TRUE, row=row_id, col=col_id, val=principal_components',
-                                            '{out_table}',
+                                            '{out_table_grouped}',
                                             'row=row_id, col=col_id,val=row_vec');
         """.format(schema_madlib=schema_madlib,
                    scaled_source_table=scaled_source_table,
-                   pc_table=pc_table,
-                   out_table=out_table))
+                   pc_table_grouped=pc_table_grouped,
+                   out_table_grouped=out_table_grouped))
 
     # Step 3: Compute the Residual table (if required)
     # Residual table: res = mat - proj
+    create_residual_table = False
     if residual_table or result_summary_table:
+        if grouping_cols:
+            residual_table_grouped = "pg_temp." + unique_string() + "_temp_residual"
+        else:
+            residual_table_grouped = residual_table
         create_temp_residual_table = False
         if not residual_table:
             create_temp_residual_table = True
-            residual_table = "pg_temp." + unique_string() + "_temp_residual"
+        else:
+            create_residual_table = True
         approx_table = "pg_temp." + unique_string() + "_approx"
         # Build an approximate reconstruction of the data
         plpy.execute(
             """
-            SELECT {schema_madlib}.matrix_mult('{out_table}',
+            SELECT {schema_madlib}.matrix_mult('{out_table_grouped}',
                                                 'row=row_id, col=col_id, val=row_vec',
-                                               '{pc_table}',
+                                               '{pc_table_grouped}',
                                                 'row=row_id, col=col_id, val=principal_components',
                                                 '{approx_table}',
                                                 'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
-                       out_table=out_table,
-                       pc_table=pc_table,
+                       out_table_grouped=out_table_grouped,
+                       pc_table_grouped=pc_table_grouped,
                        approx_table=approx_table))
 
         # Compute the difference between the reconstruction and real data
@@ -481,12 +867,12 @@ def pca_project(schema_madlib,
                                         '{approx_table}',
                                         'row=row_id, col=col_id, val=row_vec',
                                         -1,
-                                        '{residual_table}',
+                                        '{residual_table_grouped}',
                                         'row=row_id, col=col_id, val=row_vec');
             """.format(schema_madlib=schema_madlib,
                        scaled_source_table=scaled_source_table,
                        approx_table=approx_table,
-                       residual_table=residual_table))
+                       residual_table_grouped=residual_table_grouped))
 
         # Step 4: Compute the results summary table (if required)
         # If the residual table is not asked by the user, but he does ask for
@@ -494,19 +880,19 @@ def pca_project(schema_madlib,
         if result_summary_table:
             source_table_norm = plpy.execute(
                 """
-                SELECT {schema_madlib}.matrix_norm('{source_table}',
+                SELECT {schema_madlib}.matrix_norm('{source_table_grouped}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
-                           source_table=source_table,
+                           source_table_grouped=source_table_grouped,
                            row_id=row_id))[0]['r']
 
             # Compute the norm of the residual table
             residual_norm = plpy.execute(
                 """
-                SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
+                SELECT {schema_madlib}.matrix_norm('{residual_table_grouped}',
                                                    'row=row_id, col=col_id, val=row_vec') as r
                 """.format(schema_madlib=schema_madlib,
-                           residual_table_name=residual_table,
+                           residual_table_grouped=residual_table_grouped,
                            row_id=row_id))[0]['r']
             # Compute the relative error of the norm
             # Prevent division by zero
@@ -514,12 +900,6 @@ def pca_project(schema_madlib,
                 relative_residual_norm = residual_norm / source_table_norm
             else:
                 relative_residual_norm = 0
-            plpy.execute(
-                """
-                CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
-                                                      residual_norm FLOAT8,
-                                                      relative_residual_norm FLOAT8);
-                """.format(result_summary_table=result_summary_table))
             # Compute the time in milli-seconds
             t1 = time.time()
             dt = (t1 - t0) * 1000.
@@ -529,19 +909,87 @@ def pca_project(schema_madlib,
                 INSERT INTO {result_summary_table} VALUES
                     ({dt},
                      {residual_norm}::double precision,
-                     {relative_residual_norm}::double precision);
-                """.format(dt=dt,
-                           residual_norm=residual_norm,
+                     {relative_residual_norm}::double precision
+                     {grouping_cols_values}
+                     );
+                """.format(dt=dt, residual_norm=residual_norm,
                            result_summary_table=result_summary_table,
-                           relative_residual_norm=relative_residual_norm))
+                           relative_residual_norm=relative_residual_norm,
+                           grouping_cols_values=grouping_cols_values))
 
             plpy.execute("""
                      DROP TABLE IF EXISTS {approx_table};
                     """.format(approx_table=approx_table))
             if create_temp_residual_table:
                 plpy.execute("""
-                     DROP TABLE IF EXISTS {residual_table};
-                     """.format(residual_table=residual_table))
+                     DROP TABLE IF EXISTS {0};
+                     """.format(residual_table_grouped))
+
+    if is_sparse:
+        ## We don't have to join based on row_id for sparse project.
+        if create_residual_table:
+            if grouping_cols:
+                plpy.execute("""
+                        INSERT INTO {residual_table}
+                        SELECT * {select_grouping_cols}
+                        FROM {residual_table_grouped}
+                    """.format(residual_table=residual_table,
+                        select_grouping_cols=select_grouping_cols,
+                        residual_table_grouped=residual_table_grouped))
+        plpy.execute("""
+                INSERT INTO {out_table}
+                SELECT * {select_grouping_cols}
+                FROM {out_table_grouped}
+            """.format(out_table=out_table,
+                    select_grouping_cols=select_grouping_cols,
+                    out_table_grouped=out_table_grouped))
+    else:
+        output_table_cols = get_cols(out_table_grouped)
+        output_table_cols.remove('row_id')
+        output_table_select_clause = """{row_id_map_table}.{temp_row_id},
+                {out_table_cols}
+                {select_grouping_cols}
+            """.format(row_id_map_table=row_id_map_table,
+                temp_row_id=temp_row_id,
+                out_table_cols=', '.join(output_table_cols),
+                select_grouping_cols=select_grouping_cols)
+        if create_residual_table:
+            if grouping_cols:
+                plpy.execute("""
+                    INSERT INTO {residual_table}
+                    SELECT {select_clause}
+                    FROM {residual_table_grouped}
+                    INNER JOIN {row_id_map_table}
+                    ON {row_id_map_table}.row_id={residual_table_grouped}.row_id
+                    """.format(residual_table=residual_table,
+                        select_clause=output_table_select_clause,
+                        residual_table_grouped=residual_table_grouped,
+                        row_id_map_table=row_id_map_table))
+                plpy.execute("""
+                        DROP TABLE IF EXISTS {0}
+                    """.format(residual_table_grouped))
+
+        plpy.execute("""
+                    INSERT INTO {out_table}
+                    SELECT {select_clause}
+                    FROM {out_table_grouped}
+                    INNER JOIN {row_id_map_table}
+                    ON {row_id_map_table}.row_id={out_table_grouped}.row_id
+                    """.format(out_table=out_table,
+                        select_clause=output_table_select_clause,
+                        out_table_grouped=out_table_grouped,
+                        row_id_map_table=row_id_map_table))
+        plpy.execute("""
+                DROP TABLE IF EXISTS {0};
+            """.format(row_id_map_table))
+    plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+            DROP TABLE IF EXISTS {1};
+            DROP TABLE IF EXISTS {2};
+        """.format(scaled_source_table,
+            source_table_grouped, out_table_grouped))
+    if grouping_cols:
+        plpy.execute("""
+            DROP TABLE IF EXISTS {0};
+        """.format(pc_table_grouped))
 
-    plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
-    plpy.execute("SET client_min_messages TO %s" % old_msg_level)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in
index fde15d0..814292c 100644
--- a/src/ports/postgres/modules/pca/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/pca_project.sql_in
@@ -259,6 +259,219 @@ SELECT * FROM residual_table ORDER BY row_id;
       5 | {-0.0333376637524658,-0.455268589780183,1.55376831915842}
       6 | {0.00277445701511336,0.037888771752409,-0.129309103509956}
 </pre>
+-# Run the PCA project function on subsets of an input table based on grouping columns.
+Note that the 'pc_table' used for projection must also be generated using the same
+grouping columns. Create sample data:
+<pre class="example">
+DROP TABLE IF EXISTS mat_with_grouping;
+CREATE TABLE mat_with_grouping (
+    row_id integer,
+    row_vec double precision[],
+    matrix_id integer
+);
+COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{1,2,3}|1
+2|{2,1,2}|1
+3|{3,2,1}|1
+4|{1,2,3,4,5}|2
+5|{2,1,2,4,5}|2
+6|{3,2,1,4,5}|2
+7|{1,2,4,4,5}|2
+8|{1,3,3,4,5}|2
+9|{2,2,2,4,5}|2
+10|{3,3,2,4,5}|2
+11|{2,2,3,4,5}|2
+\.
+</pre>
+-# Run the PCA function using grouping_cols and capture 90% of the proportion of variance:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grp;
+DROP TABLE IF EXISTS result_table_grp_mean;
+SELECT madlib.pca_train( 'mat_with_grouping',
+          'result_table_grp',
+          'row_id',
+          0.9,
+          'matrix_id'
+    );
+</pre>
+-# Project the original data to low-dimensional representation, based on independent PCA models
+learnt for each group:
+<pre class="example">
+DROP TABLE IF EXISTS mat_with_grouping_projected;
+SELECT madlib.pca_project(
+    'mat_with_grouping',
+    'result_table_grp',
+    'mat_with_grouping_projected',
+    'row_id'
+    );
+</pre>
+-# View the projected data:
+<pre class="example">
+SELECT * FROM mat_with_grouping_projected;
+</pre>
+<pre class="result">
+ row_id |                row_vec                 | matrix_id
+--------+----------------------------------------+-----------
+      1 | {1.41421356237309,0.33333333333}       |         1
+      2 | {1.08709337828423e-16,-0.66666666667}  |         1
+      3 | {-1.41421356237309,0.33333333333}      |         1
+      4 | {-0.947334292839873,0.757198329919689} |         2
+      5 | {0.503945652182431,-0.357198389962114} |         2
+      6 | {1.87289054234556,-1.63346278982675}   |         2
+      7 | {-1.69764931050669,1.55408536983517}   |         2
+      8 | {-0.98850182026946,0.676264489928273}  |         2
+      9 | {0.462778124752844,-0.43813222995353}  |         2
+     10 | {1.08140799724916,-0.917509589902686}  |         2
+     11 | {-0.287536892913973,0.358754809961949} |         2
+</pre>
+
+-# Create sample data in sparse matrix form:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+                        row_id integer,
+                        col_id integer,
+                        val_id integer
+                        );
+INSERT INTO sparse_mat VALUES
+(1, 2, 4.0),
+(1, 5, 6.0),
+(3, 8, 4.0),
+(5, 4, 2.0),
+(6, 6, 12.0),
+(8, 1, 2.0),
+(8, 7, 2.0),
+(9, 3, 4.0),
+(9, 8, 2.0);
+</pre>
+-# Run the PCA sparse function for a specified number of principal components:
+<pre class="example">
+DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train(
+                    'sparse_mat',
+                    'result_table',
+                    'row_id',
+                    'col_id',
+                    'val_id',
+                    9,
+                    8,
+                    5);
+</pre>
+-# Project the original sparse data to low-dimensional representation:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_out;
+SELECT madlib.pca_sparse_project(
+                    'sparse_mat',
+                    'result_table',
+                    'sparse_mat_out',
+                    'row_id',
+                    'col_id',
+                    'val_id',
+                    9,
+                    8
+                    );
+</pre>
+-# View the projected data:
+<pre class="example">
+SELECT * FROM sparse_mat_out ORDER BY row_id;
+</pre>
+<pre class="result">
+ row_id |                                              row_vec
+--------+----------------------------------------------------------------------------------------------------
+      1 | {1.34695183995672,0.879135043008323,1.87818383323404,-2.82842712474619,1.24074659599017}
+      2 | {1.63437198343112,-5.32859025290785,-0.240160697276225,-2.36847578586602e-15,0.295172631209684}
+      3 | {1.43909964102331,1.28442554799452,-2.99624380589971,-7.21876262469717e-15,1.06626861836748}
+      4 | {1.21839219844249,0.517445462630873,0.298488396687886,2.88426689939176e-15,-1.34560049142698}
+      5 | {-10.7325138823598,-0.136175420017185,-0.0338708165820332,8.58109879450845e-16,0.0577033753567249}
+      6 | {1.21839219844249,0.517445462630873,0.298488396687885,2.1626219333854e-15,-1.34560049142698}
+      7 | {1.21839219844249,0.517445462630873,0.298488396687885,1.91282175284474e-15,-1.34560049142698}
+      8 | {1.34695183995672,0.879135043008322,1.87818383323403,2.82842712474619,1.24074659599017}
+      9 | {1.30996198263469,0.86973365102307,-1.38155753676828,-2.88889282865906e-15,0.136163657369328}
+</pre>
+-# Create sample data in sparse matrix form, with a grouping column:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_with_grouping;
+CREATE TABLE sparse_mat_with_grouping (
+    row_id integer,
+    col_id integer,
+    val_id integer,
+    matrix_id integer
+);
+COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|1
+8|1|2|1
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+8|1|2|2
+8|7|2|2
+\.
+</pre>
+-#  Run the PCA sparse function using grouping_cols and capture 90% of the proportion of variance:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grouped, result_table_grouped_mean;
+SELECT madlib.pca_sparse_train (
+        'sparse_mat_with_grouping',
+        'result_table_grouped',
+        'row_id',
+        'col_id',
+        'val_id',
+        10,
+        10,
+        0.9,
+        'matrix_id'
+    );
+</pre>
+-# Project the original sparse data to low-dimensional representation, based on independent PCA models
+learnt for each group:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_with_grouping_projected;
+SELECT madlib.pca_sparse_project(
+    'sparse_mat_with_grouping',
+    'result_table_grouped',
+    'sparse_mat_with_grouping_projected',
+    'row_id',
+    'col_id',
+    'val_id',
+    10,
+    10
+    );
+</pre>
+-# View the projected data:
+<pre class="example">
+SELECT * FROM sparse_mat_with_grouping_projected ORDER BY matrix_id,row_id;
+</pre>
+<pre class="result">
+ row_id |                                     row_vec                                     | matrix_id
+--------+---------------------------------------------------------------------------------+-----------
+      1 | {1.22692546452761,0.818988440128327,1.81591522940788,2.82842712474619}          |         1
+      2 | {1.4852284082451,-5.38921614355968,-0.238413962586454,-2.76195922299053e-16}    |         1
+      3 | {1.30995313897398,1.19175698765945,-3.05860526896832,-2.63021567919958e-15}     |         1
+      4 | {1.11099454391467,0.484270184421277,0.294314185921038,-1.61822162787753e-16}    |         1
+      5 | {-10.8525988405155,-0.128517253667187,-0.0335370457586047,8.53109060561845e-16} |         1
+      6 | {1.11099454391467,0.484270184421277,0.294314185921038,2.9799754824927e-16}      |         1
+      7 | {1.11099454391467,0.484270184421277,0.294314185921038,-8.08358590285308e-16}    |         1
+      8 | {1.22692546452761,0.818988440128326,1.81591522940789,-2.82842712474619}         |         1
+      9 | {1.19351195715706,0.808981987902128,-1.41427400274461,-1.38815367040019e-15}    |         1
+     10 | {1.07707077534015,0.42620698814481,0.230057263479109,-1.46091661600792e-16}     |         1
+      1 | {-1.28753936783518,4.42689855953263}                                            |         2
+      2 | {-1.56033687316063,0.102219682546935}                                           |         2
+      3 | {10.8262428615875,-0.605455042777997}                                           |         2
+      4 | {-1.16530001926412,-0.290932942633583}                                          |         2
+      5 | {-1.16530001926412,-0.605455042777997}                                          |         2
+      6 | {-1.12955331641269,-0.605455042777998}                                          |         2
+      7 | {-1.12955331641269,-0.605455042777998}                                          |         2
+      8 | {-1.12955331641269,-0.605455042777998}                                          |         2
+      9 | {-1.12955331641269,-0.605455042777998}                                          |         2
+     10 | {-1.12955331641269,-0.605455042777998}                                          |         2
+</pre>
 
 @anchor notes
 @par Notes
@@ -333,7 +546,7 @@ MADLIB_SCHEMA.pca_project(
     out_table               TEXT,    -- Output table name for the principal components
     row_id                  TEXT,    -- Column name for the ID for each row
     residual_table          TEXT,    -- Residual table (Default: NULL)
-    result_summary_table    TEXT     -- Table name to store summary of results (Default: NULL)
+    result_summary_table    TEXT    -- Table name to store summary of results (Default: NULL)
 )
 RETURNS VOID AS $$
 PythonFunction(pca, pca_project, pca_project)
@@ -457,6 +670,33 @@ $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_project_union(
+    source_table                TEXT,    -- Source table name (dense or sparse matrix, see is_sparse)
+    pc_table                    TEXT,    -- Table containing the principal components (presumably pca_train output; confirm)
+    out_table                   TEXT,    -- Output table name
+    row_id                      TEXT,    -- Column name for the ID for each row
+    original_row_id             TEXT,    -- copy of the row_id originally passed
+    grouping_cols               TEXT,    -- Comma-separated list of grouping columns
+    grouping_cols_clause        TEXT,    -- Part of the SQL query to be used with grouping_cols
+    residual_table              TEXT,    -- Residual table name
+    result_summary_table        TEXT,    -- Table name to store summary of results
+    grp_id                      INTEGER, -- a place holder id for each group
+    grouping_where_clause       TEXT,    -- WHERE clause using grouping_cols
+    select_grouping_cols        TEXT,    -- SELECT clause using grouping_cols
+    grouping_cols_values        TEXT,    -- distinct values of the grouping_cols
+    temp_source_table_columns   TEXT,    -- SELECT clause for creating temporary copy of the source_table
+    temp_pc_table_columns       TEXT,    -- non grouping_cols of the source_table
+    is_sparse                   BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+    col_id                      TEXT,    -- sparse input only: presumably the column-index column name (confirm)
+    val_id                      TEXT,    -- sparse input only: presumably the value column name (confirm)
+    row_dim                     INTEGER, -- sparse input only: presumably the number of matrix rows (confirm)
+    col_dim                     INTEGER  -- sparse input only: presumably the number of matrix columns (confirm)
+)
+RETURNS VOID AS $$
+PythonFunction(pca, pca_project, _pca_project_union)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
 
 -- Help and usage functions
 -----------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/test/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca.sql_in b/src/ports/postgres/modules/pca/test/pca.sql_in
index 8cff3db..20093ac 100644
--- a/src/ports/postgres/modules/pca/test/pca.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca.sql_in
@@ -118,6 +118,86 @@ NULL, 5, FALSE, 'result_table_214712398172490838');
 select * from result_table_214712398172490837;
 select * from result_table_214712398172490838;
 
+-- Test dense data with grouping
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+    id integer,
+    row_vec double precision[],
+    grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+
+-- Learn individual PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
+-- Matrix in the column format
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+    id  integer,
+    val0    float8,
+    val1    float8,
+    val2    float8,
+    val3    float8,
+    val4    float8,
+    val5    float8,
+    val6    float8,
+    val7    float8,
+    val8    float8,
+    val9    float8,
+    grp     integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- Learn individual PCA models based on grouping column (grp). NOTE(review): the call below trains on 'mat', not the 'cmat' table created above; confirm intent.
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
 -- SPARSE PCA: Make sure all possible default calls for sparse PCA work
 -----------------------------------------------------------------------------
 
@@ -228,9 +308,37 @@ select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
 select * from result_table_214712398172490837;
 select * from result_table_214712398172490838;
 
+-- Sparse input data with grouping column
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+    id integer,
+    col_id integer,
+    val_id integer,
+    grp    integer
+);
+COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|2
+8|7|2|2
+8|1|2|2
+9|8|2|2
+9|3|4|2
+\.
+-- Learn individual PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 0.8, 'grp', 0, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
 -------------------------------------------------------------------------
 -- test a different column name
-alter table sparse_mat rename column row_id to rownr;
+alter table sparse_mat rename column id to rownr;
 alter table sparse_mat rename column col_id to colnr;
 alter table sparse_mat rename column val_id to vals;
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/test/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca_project.sql_in b/src/ports/postgres/modules/pca/test/pca_project.sql_in
index 82c547a..beb0ef4 100644
--- a/src/ports/postgres/modules/pca/test/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca_project.sql_in
@@ -30,8 +30,6 @@ drop table if exists result_table_214712398172490837;
 drop table if exists result_table_214712398172490837_mean;
 select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
 'row_id', 'col_id', 'val_id', 10, 10, 10);
-select * from result_table_214712398172490837;
-
 
 drop table if exists out_table_214712398172490837;
 select pca_sparse_project( 'sparse_mat',
@@ -120,6 +118,7 @@ select pca_project( 'mat',
 
 drop table if exists out_table_214712398172490837;
 drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
 select pca_project( 'mat',
     'result_table_214712398172490837',
     'out_table_214712398172490837',
@@ -172,4 +171,129 @@ select pca_project( 'cmat',
                     'out_table_214712398172490837',
                     'row_id');
 
+-- Sparse matrix with grouping column
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+    id integer,
+    col_id integer,
+    val_id integer,
+    grp    integer
+);
+COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|2
+8|7|2|2
+8|1|2|2
+9|8|2|2
+9|3|4|2
+\.
+-- project sparse matrix using model learned with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 10, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists summary_table_214712398172490837;
+select pca_sparse_project( 'sparse_mat',
+                    'result_table_214712398172490837',
+                    'out_table_214712398172490837',
+                    'id',
+                    'col_id',
+                    'val_id',
+                    10,
+                    10,
+                    'residual_table_214712398172490837',
+                    'summary_table_214712398172490837');
+
+-- Test data (Identical to SVD) with grouping column
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+    id integer,
+    row_vec double precision[],
+    grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+-- project dense matrix using model learned with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 5, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
+select pca_project( 'mat',
+    'result_table_214712398172490837',
+    'out_table_214712398172490837',
+    'id',
+    'residual_table_214712398172490837',
+    'result_summary_table_214712398172490837');
+
+
+-- Matrix in the column format with grouping column
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+    id  integer,
+    val0    float8,
+    val1    float8,
+    val2    float8,
+    val3    float8,
+    val4    float8,
+    val5    float8,
+    val6    float8,
+    val7    float8,
+    val8    float8,
+    val9    float8,
+    grp     integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- project column-format matrix using model learned with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('cmat', 'result_table_214712398172490837', 'id', 5, 'grp');
 
+drop table if exists out_table_214712398172490837;
+select pca_project( 'cmat',
+                    'result_table_214712398172490837',
+                    'out_table_214712398172490837',
+                    'id');