You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2017/01/19 19:53:30 UTC
[3/6] incubator-madlib git commit: PCA: Add grouping support to PCA
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca_project.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.py_in b/src/ports/postgres/modules/pca/pca_project.py_in
index 98d9e3f..1e61d3c 100644
--- a/src/ports/postgres/modules/pca/pca_project.py_in
+++ b/src/ports/postgres/modules/pca/pca_project.py_in
@@ -19,6 +19,8 @@ from utilities.utilities import _array_to_string
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_exists
from utilities.utilities import add_postfix
+from utilities.validate_args import get_cols, get_cols_and_types
+from utilities.control import MinWarning
version_wrapper = __mad_version()
@@ -27,69 +29,149 @@ array_to_string = version_wrapper.select_vec_return()
ZERO_THRESHOLD = 1e-6
-# Dense PCA help function
+# Dense PCA project help function
def pca_project_help(schema_madlib, usage_string=None, **kwargs):
"""
Given a usage string, give out function usage information.
"""
- if usage_string is None:
- usage_string = ''
-
- if (usage_string.lower() == "usage"):
+ if usage_string is not None and \
+ usage_string.lower() in ("usage", "help", "?"):
return """
- ----------------------------------------------------------------
- Usage
- ----------------------------------------------------------------
- SELECT {schema_madlib}.pca_project (
- 'tbl_source', -- Data table
- 'tbl_pc', -- Table with principal componenents
- (obtained as output from pca_train)
- 'tbl_result', -- Result table
- 'row_id', -- Name of the column containing the row_id
- -- Optional Parameters
- ----------------------------------------------------------------
- 'tbl_residual', -- Residual table (Default: NULL)
- 'tbl_result_summary', -- Result summary table (Default : NULL)
- );
-
- Output Tables
- --------------------------------------------------------------------
- The output is divided into three tables (two of which are optional)
-
- --------------------------------------------------------------------
- The output table ('tbl_result' above) encodes a dense matrix
- with the projection onto the principal components. The matrix contains
- the following columns:
-
- 'row_id' INTEGER, -- Row id of the output matrix
- 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
- --------------------------------------------------------------------
- The residual table ('tbl_residual' above) encodes a dense residual
- matrix which has the following columns
-
- 'row_id' INTEGER, -- Row id of the output matrix
- 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
- --------------------------------------------------------------------
- The result summary table ('tbl_result_summary' above) has the following columns
-
- 'exec_time' INTEGER, -- Wall clock time (ms) of the function.
- 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
- 'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
- ----------------------------------------------------------------
+----------------------------------------------------------------
+ Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_project (
+ 'tbl_source', -- Data table
+ 'pc_table', -- Table with principal componenents
+ (obtained as output from pca_train)
+ 'tbl_result', -- Result table
+ 'row_id', -- Name of the column containing the row_id
+ -- Optional Parameters
+ ----------------------------------------------------------------
+ 'tbl_residual', -- Residual table (Default: NULL)
+ 'tbl_result_summary', -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in pc_table were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+ Output Tables
+--------------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+--------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id' INTEGER, -- Row id of the output matrix
+ 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id' INTEGER, -- Row id of the output matrix
+ 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+
+--------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+ 'exec_time' INTEGER, -- Wall clock time (ms) of the function.
+ 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
+ 'relative_residual_norm' DOUBLE PRECISION, -- Relative error of the residuals
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
""".format(schema_madlib=schema_madlib)
else:
- return """
- ----------------------------------------------------------------
- Summary: PCA Projection
- ----------------------------------------------------------------
- PCA Projection: Projects a dataset to an already trained
- space of principal components.
- --
- For function usage information, run
- SELECT {schema_madlib}.pca_project('usage');
- --
+ if usage_string is not None and \
+ usage_string.lower() in ("example", "examples"):
+ return """
+----------------------------------------------------------------
+ Examples
+----------------------------------------------------------------
+-- Run pca_project() using a model table generated without grouping_cols.
+-- Create input table for pca_project()
+
+DROP TABLE IF EXISTS mat_proj;
+CREATE TABLE mat_proj (
+ row_id integer,
+ row_vec double precision[]
+);
+COPY mat_proj (row_id, row_vec) FROM stdin DELIMITER '|';
+1|{{1,2,3}}
+2|{{2,1,2}}
+3|{{3,2,1}}
+11|{{1,2,3}}
+21|{{2,1,2}}
+31|{{3,2,1}}
+41|{{1,2,4}}
+12|{{1,3,3}}
+\.
+
+-- NOTE: Use the 'result_table' created using the example shown in
+-- {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter here.
+
+DROP TABLE IF EXISTS mat_proj_out;
+SELECT {schema_madlib}.pca_project(
+ 'mat_proj',
+ 'result_table',
+ 'mat_proj_out',
+ 'row_id'
+ );
+
+SELECT * FROM mat_proj_out;
+
+-----------------------------------------------------------------------
+
+-- Run pca_project() using a model table generated with grouping_cols.
+-- Create input table for pca_project(), with grouping
+
+DROP TABLE IF EXISTS mat_proj_grouped;
+CREATE TABLE mat_proj_grouped (
+ row_id integer,
+ row_vec double precision[],
+ matrix_id integer
+);
+COPY mat_proj_grouped (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{{1,2,3}}|1
+2|{{2,1,2}}|1
+3|{{3,2,1}}|1
+4|{{1,2,3,4,5}}|2
+5|{{2,1,2,4,5}}|2
+6|{{3,2,1,4,5}}|2
+\.
+
+-- NOTE: Use the 'result_table_grp' created using the example shown
+-- in {schema_madlib}.pca_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_grp' was created with 'matrix_id' as the
+-- grouping column, and the table 'mat_proj_grouped' should also have the
+-- 'matrix_id' column in it.
+
+DROP TABLE IF EXISTS mat_proj_grouped_out;
+SELECT {schema_madlib}.pca_project(
+ 'mat_proj_grouped',
+ 'result_table_grp',
+ 'mat_proj_grouped_out',
+ 'row_id'
+ );
+
+SELECT * FROM mat_proj_grouped_out;
+ """.format(schema_madlib=schema_madlib)
+ else:
+ return """
+----------------------------------------------------------------
+ Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run
+SELECT {schema_madlib}.pca_project('usage');
+--
""".format(schema_madlib=schema_madlib)
@@ -99,84 +181,204 @@ def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
"""
Given a usage string, give out function usage information.
"""
- if usage_string is None:
- usage_string = ''
-
- if (usage_string.lower() == "usage"):
+ if usage_string is not None and \
+ usage_string.lower() in ("usage", "help", "?"):
return """
- ----------------------------------------------------------------
- Usage
- ----------------------------------------------------------------
- SELECT {schema_madlib}.pca_sparse_project (
- 'tbl_source', -- Data table
- 'tbl_pc', -- Table with principal componenents
- (obtained as output from pca_train)
- 'tbl_result', -- Result table
- 'row_id', -- Name of the column containing the row_id
- 'col_id', -- Name of the column containing the col_id
- 'val_id', -- Name of the column containing the val_id
- 'row_dim' -- Row dimension of the sparse matrix
- 'col_dim' -- Column dimension of the sparse matrix
- -- Optional Parameters
- ----------------------------------------------------------------
- 'tbl_residual', -- Residual table (Default: NULL)
- 'tbl_result_summary', -- Result summary table (Default : NULL)
- );
-
- Output Tables
- ----------------------------------------------------------------
- The output is divided into three tables (two of which are optional)
-
- -----------------------------------------------------------------------------------------
- The output table ('tbl_result' above) encodes a dense matrix
- with the projection onto the principal components. The matrix contains
- the following columns:
-
- 'row_id' INTEGER, -- Row id of the output matrix
- 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
- -----------------------------------------------------------------------------------------
- The residual table ('tbl_residual' above) encodes a dense residual
- matrix which has the following columns
-
- 'row_id' INTEGER, -- Row id of the output matrix
- 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-
- -----------------------------------------------------------------------------------------
- The result summary table ('tbl_result_summary' above) has the following columns
-
- 'exec_time' INTEGER, -- Wall clock time (ms) of the function.
- 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
- 'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
- ----------------------------------------------------------------
+----------------------------------------------------------------
+ Usage
+----------------------------------------------------------------
+SELECT {schema_madlib}.pca_sparse_project (
+ 'tbl_source', -- Data table
+ 'pc_table', -- Table with principal componenents
+ (obtained as output from pca_train)
+ 'tbl_result', -- Result table
+ 'row_id', -- Name of the column containing the row_id
+ 'col_id', -- Name of the column containing the col_id
+ 'val_id', -- Name of the column containing the val_id
+ 'row_dim' -- Row dimension of the sparse matrix
+ 'col_dim' -- Column dimension of the sparse matrix
+ -- Optional Parameters
+ ----------------------------------------------------------------
+ 'tbl_residual', -- Residual table (Default: NULL)
+ 'tbl_result_summary', -- Result summary table (Default : NULL)
+);
+
+Note that if the principal components in 'pc_table' were learnt using
+grouping_cols in {schema_madlib}.pca_train(), the tbl_source used
+here must also have those grouping columns. This will fail otherwise.
+
+ Output Tables
+----------------------------------------------------------------
+The output is divided into three tables (two of which are optional)
+
+-----------------------------------------------------------------------------------------
+ The output table ('tbl_result' above) encodes a dense matrix
+ with the projection onto the principal components. The matrix contains
+ the following columns:
+
+ 'row_id' INTEGER, -- Row id of the output matrix
+ 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The residual table ('tbl_residual' above) encodes a dense residual
+ matrix which has the following columns
+
+ 'row_id' INTEGER, -- Row id of the output matrix
+ 'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+
+-----------------------------------------------------------------------------------------
+ The result summary table ('tbl_result_summary' above) has the following columns
+
+ 'exec_time' INTEGER, -- Wall clock time (ms) of the function.
+ 'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
+ 'relative_residual_norm' DOUBLE PRECISION, -- Relative error of the residuals
+ grouping_col -- The grouping columns present in the 'pc_table', if any
+----------------------------------------------------------------
""".format(schema_madlib=schema_madlib)
else:
- return """
- ----------------------------------------------------------------
- Summary: PCA Projection
- ----------------------------------------------------------------
- PCA Projection: Projects a dataset to an already trained
- space of principal components.
- --
- For function usage information, run
- SELECT {schema_madlib}.pca_sparse_project('usage');
- --
+ if usage_string is not None and \
+ usage_string.lower() in ("example", "examples"):
+ return """
+----------------------------------------------------------------
+ Examples
+----------------------------------------------------------------
+-- Run pca_sparse_project() using a model table generated without grouping_cols.
+-- Create input table for pca_sparse_project()
+
+DROP TABLE IF EXISTS sparse_proj_mat;
+CREATE TABLE sparse_proj_mat (
+ row_id integer,
+ col_id integer,
+ val_id integer
+);
+COPY sparse_proj_mat (row_id, col_id, val_id) FROM stdin delimiter '|';
+1|2|4
+1|5|6
+3|8|4
+8|1|2
+8|7|2
+9|3|4
+9|8|2
+\.
+
+-- NOTE: Use the 'result_table_sparse' created using the example shown in
+-- {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter here.
+
+SELECT {schema_madlib}.pca_sparse_project(
+ 'sparse_proj_mat',
+ 'result_table_sparse',
+ 'sparse_proj_mat_out',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 10,
+ 10
+ );
+
+SELECT * FROM sparse_proj_mat_out;
+
+
+-- Run pca_sparse_project() using a model table generated with grouping_cols.
+-- Create input table for pca_sparse_project(), with grouping
+
+DROP TABLE IF EXISTS sparse_proj_mat_with_grouping;
+CREATE TABLE sparse_proj_mat_with_grouping (
+ row_id integer,
+ col_id integer,
+ val_id integer,
+ matrix_id integer
+);
+COPY sparse_proj_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+\.
+
+-- NOTE: Use the 'result_table_sparsed_grouped' created using the example shown
+-- in {schema_madlib}.pca_sparse_train('examples'), as the 'pc_table' parameter
+-- here. 'result_table_sparsed_grouped' was created with 'matrix_id' as the
+-- grouping column, and the table 'sparse_proj_mat_with_grouping' should also have
+-- the 'matrix_id' column in it.
+
+SELECT {schema_madlib}.pca_sparse_project(
+ 'sparse_proj_mat_with_grouping',
+ 'result_table_sparsed_grouped',
+ 'sparse_proj_mat_with_grouping_out',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 10,
+ 10
+ );
+
+SELECT * FROM sparse_proj_mat_with_grouping_out;
+ """.format(schema_madlib=schema_madlib)
+ else:
+ return """
+----------------------------------------------------------------
+ Summary: PCA Projection
+----------------------------------------------------------------
+PCA Projection: Projects a dataset to an already trained
+space of principal components.
+--
+For function usage information, run:
+SELECT {schema_madlib}.pca_sparse_project('usage');
+--
""".format(schema_madlib=schema_madlib)
+def _validate_args_output_table(out_table, residual_table=None,
+ result_summary_table=None):
+ """
+ Validates the output table(s) arguments to the PCA project function
+
+ Args:
+ @param out_table Name of output table to store projection result
+ @param residual_table Name of the residual table (to store error in projection)
+ @param result_summary_table Name of result summary table
+ Returns:
+ None
+ Throws:
+ plpy.error if any argument is invalid
+ """
+ # Make sure that the output table does not exist
+ # Also check that the output table is not null
+ _assert(out_table and out_table.strip(),
+ "PCA error: Invalid output table name.")
+ _assert(not table_exists(out_table, only_first_schema=True),
+ "PCA error: Output table {0} already exists!".format(str(out_table)))
+
+ # Check that the result summary table is not empty
+ if result_summary_table is not None:
+ _assert(result_summary_table.strip(),
+ "PCA error: Invalid result summary table name!")
+ _assert(not table_exists(result_summary_table, only_first_schema=True),
+ "PCA error: Result summary table {0} already exists!".
+ format(result_summary_table))
+
+ # Check that the result summary table is not empty
+ if residual_table is not None:
+ _assert(residual_table.strip(),
+ "PCA error: Invalid residual table name!")
+ _assert(not table_exists(residual_table, only_first_schema=True),
+ "PCA error: Residual table {0} already exists!".
+ format(residual_table))
+
# Validate arguments: Same as pca
# ------------------------------------------------------------------------
def _validate_args(schema_madlib,
source_table,
pc_table,
- out_table,
row_id,
col_id=None,
val_id=None,
row_dim=None,
- col_dim=None,
- residual_table=None,
- result_summary_table=None):
+ col_dim=None):
"""
Validates all arguments passed to the PCA function
@@ -206,29 +408,6 @@ def _validate_args(schema_madlib,
_assert(table_exists(add_postfix(pc_table, "_mean")),
"PCA error: Source data table column means does not exist!")
- # Make sure that the output table does not exist
- # Also check that the output table is not null
- _assert(out_table and out_table.strip(),
- "PCA error: Invalid output table name.")
- _assert(not table_exists(out_table, only_first_schema=True),
- "PCA error: Output table {0} already exists!".format(str(out_table)))
-
- # Check that the result summary table is not empty
- if result_summary_table is not None:
- _assert(result_summary_table.strip(),
- "PCA error: Invalid result summary table name!")
- _assert(not table_exists(result_summary_table, only_first_schema=True),
- "PCA error: Result summary table {0} already exists!".
- format(result_summary_table))
-
- # Check that the result summary table is not empty
- if residual_table is not None:
- _assert(residual_table.strip(),
- "PCA error: Invalid residual table name!")
- _assert(not table_exists(residual_table, only_first_schema=True),
- "PCA error: Residual table {0} already exists!".
- format(residual_table))
-
# Check that the row_id exists
_assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
@@ -254,7 +433,6 @@ def _validate_args(schema_madlib,
check_col=False)
# ------------------------------------------------------------------------
-
def pca_sparse_project(schema_madlib,
source_table,
pc_table,
@@ -290,65 +468,9 @@ def pca_sparse_project(schema_madlib,
Throws:
plpy.error if any argument is invalid
"""
-
- # Reset the message level to avoid random messages
- old_msg_level = plpy.execute("""
- SELECT setting
- FROM pg_settings
- WHERE name='client_min_messages'
- """)[0]['setting']
- plpy.execute('SET client_min_messages TO warning')
-
- # Step 1: Validate the input arguments
- _validate_args(schema_madlib,
- source_table,
- pc_table,
- out_table,
- row_id,
- col_id,
- val_id,
- row_dim,
- col_dim,
- residual_table,
- result_summary_table)
-
- # Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
- # Warning: This changes the column names of the table
- sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
- create_temp_sparse_matrix_table_with_dims(source_table,
- sparse_table_copy,
- row_id,
- col_id,
- val_id,
- row_dim,
- col_dim)
-
- # Step 3: Densify the input matrix
- x_dense = "pg_temp." + unique_string() + "_dense"
- plpy.execute("""
- SELECT {schema_madlib}.matrix_densify(
- '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
- '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
- """.format(**locals()))
-
- # Step 4: Pass the densified matrix to regular PCA
- pca_project(schema_madlib,
- x_dense,
- pc_table,
- out_table,
- 'row_id',
- residual_table,
- result_summary_table)
-
- # Step 4: Clean up
- plpy.execute(
- """
- DROP TABLE IF EXISTS {x_dense};
- DROP TABLE IF EXISTS {sparse_table_copy};
- """.format(x_dense=x_dense,
- sparse_table_copy=sparse_table_copy))
-
- plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+ pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+ row_id, residual_table, result_summary_table,
+ True, col_id, val_id, row_dim, col_dim)
# ------------------------------------------------------------------------
@@ -379,8 +501,21 @@ def pca_project(schema_madlib,
Throws:
plpy.error if any argument is invalid
"""
- t0 = time.time() # measure the starting time
+ pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+ row_id, residual_table, result_summary_table)
+
+def pca_project_wrap(schema_madlib, source_table, pc_table, out_table,
+ row_id, residual_table,
+ result_summary_table, is_sparse=False,
+ col_id=None, val_id=None, row_dim=None,
+ col_dim=None, **kwargs):
+ """
+ This wrapper was added to support grouping columns. This
+ function does the necessary pre-processing for handling
+ grouping_cols, if set. It then constructs a single query that
+ includes a separate "madlib.pca_project_wrap(...)" for each group.
+ """
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
@@ -388,24 +523,267 @@ def pca_project(schema_madlib,
WHERE name='client_min_messages'
""")[0]['setting']
plpy.execute('SET client_min_messages TO warning')
+ # If we add new columns to the pca_train output table in the future, they should
+ # be included in this list:
+ pc_table_model_cols = ['row_id', 'principal_components', 'std_dev', 'proportion']
+ grouping_cols_list = [col for col in get_cols(pc_table) if col not in pc_table_model_cols]
+ grouping_cols = ''
+ if grouping_cols_list:
+ grouping_cols = ', '.join(grouping_cols_list)
+ _validate_args_output_table(out_table, residual_table, result_summary_table)
+
+ other_columns_in_table = [col for col in get_cols(source_table) if col not in grouping_cols_list]
+ grouping_cols_clause = ''
+ if(grouping_cols):
+ # validate the grouping columns. We currently only support grouping_cols
+ # to be column names in the source_table, and not expressions!
+ _assert(columns_exist_in_table(source_table, grouping_cols_list, schema_madlib),
+ """PCA error: One or more grouping columns in {0} do not exist in {1}, but
+ the model in {2} was learnt with grouping!""".format(grouping_cols,
+ source_table, pc_table))
+ distinct_grouping_values = plpy.execute("""
+ SELECT DISTINCT {grouping_cols} FROM {source_table}
+ """.format(grouping_cols=grouping_cols, source_table=source_table))
+ cols_names_types = get_cols_and_types(source_table)
+ grouping_cols_clause = ', ' + ', '.join([c_name+" "+c_type
+ for (c_name, c_type) in cols_names_types if c_name in grouping_cols_list])
+ ## Create all output tables
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ CREATE TABLE {0} (
+ row_id INTEGER,
+ row_vec double precision[]
+ {1}
+ ) """.format(out_table, grouping_cols_clause))
+ if result_summary_table:
+ plpy.execute(
+ """
+ DROP TABLE IF EXISTS {0};
+ CREATE TABLE {0} (
+ exec_time FLOAT8,
+ residual_norm FLOAT8,
+ relative_residual_norm FLOAT8
+ {1}
+ ) """.format(result_summary_table, grouping_cols_clause))
+ else:
+ result_summary_table = ''
+ if residual_table and grouping_cols:
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ CREATE TABLE {0} (
+ row_id INTEGER,
+ row_vec double precision[]
+ {1}
+ ) """.format(residual_table, grouping_cols_clause))
+ if not residual_table:
+ residual_table = ''
+
+ # declare variables whose values will be different for each group, if
+ # grouping_cols is specified
+ grouping_where_clause = ''
+ select_grouping_cols = ''
+ temp_source_table_columns = ''
+ grouping_cols_values = ''
+ result_summary_table_temp = ''
+ other_columns_in_pc_table = [col for col in get_cols(pc_table) if col not in grouping_cols_list]
+ temp_pc_table_columns = ', '.join(other_columns_in_pc_table)
+ original_row_id = row_id
+ other_columns_in_table.remove(row_id)
+ temp_source_table_columns = """ ROW_NUMBER() OVER({partition_over}) AS row_id,
+ """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)) +\
+ ','.join(other_columns_in_table)
+ # We can now ignore the original row_id for all computations since we will
+ # create a new table with a row_id column that has perfect serially increasing
+ # row_id value. This is to support the scenario where users are not forced
+ # to have a row_id that follows a particular format. This restriction of having to
+ # provide a serially increasing row_id value starting from 1 becomes a pain
+ # point when grouping is used, since the row_id for each group will then have
+ # to start from 1.
+ row_id = 'row_id'
+ pca_union_call_list = []
+ grp_id = 0
+ if not is_sparse:
+ col_id = 'NULL'
+ val_id = 'NULL'
+ row_dim = 0
+ col_dim = 0
+ while True:
+ if grouping_cols:
+ grp_value_dict = distinct_grouping_values[grp_id]
+ grouping_where_clause = ' WHERE ' + ' AND '.join([str(key)+"="+str(value) for (key, value) in grp_value_dict.items()])
+ select_grouping_cols = ', ' + ', '.join([str(value)+" AS "+key for (key, value) in grp_value_dict.items()])
+ grouping_cols_values = ', ' + ', '.join([str(value) for (key, value) in grp_value_dict.items()])
+
+ pca_union_call_list.append("""
+ {schema_madlib}._pca_project_union('{source_table}', '{pc_table}', '{out_table}',
+ '{row_id}', '{original_row_id}', '{grouping_cols}',
+ '{grouping_cols_clause}', '{residual_table}',
+ '{result_summary_table}', {grp_id}, '{grouping_where_clause}', '{select_grouping_cols}',
+ '{grouping_cols_values}', '{temp_source_table_columns}', '{temp_pc_table_columns}',
+ {is_sparse}, '{col_id}', '{val_id}', {row_dim}, {col_dim})
+ """.format(schema_madlib=schema_madlib,
+ source_table=source_table, pc_table=pc_table,
+ out_table=out_table, row_id=row_id,
+ original_row_id=original_row_id,
+ grouping_cols=grouping_cols,
+ grouping_cols_clause=grouping_cols_clause,
+ residual_table=residual_table,
+ result_summary_table=result_summary_table,
+ grp_id=grp_id, grouping_where_clause=grouping_where_clause,
+ select_grouping_cols=select_grouping_cols,
+ grouping_cols_values=grouping_cols_values,
+ temp_source_table_columns=temp_source_table_columns,
+ temp_pc_table_columns=temp_pc_table_columns, is_sparse=is_sparse,
+ col_id=col_id, val_id=val_id, row_dim=row_dim, col_dim=col_dim))
+ grp_id += 1
+ if not grouping_cols_list or len(distinct_grouping_values) == grp_id:
+ break
+ # "SELECT <query_1>, <query_2>, <query_3>, ..." is expected to run each
+ # <query_i> in parallel.
+ pca_union_call = 'SELECT ' + ', '.join(pca_union_call_list)
+ try:
+ plpy.execute(pca_union_call)
+ except Exception as e:
+ ## drop the output tables that were created if PCA errored out.
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ """.format(out_table))
+ if result_summary_table:
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0}
+ """.format(result_summary_table))
+ if residual_table:
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0}
+ """.format(residual_table))
+ plpy.error(str(e) + "\n" + str(e.args) + "\n" + str(e.strerror))
+ raise
+
+ plpy.execute("SET client_min_messages TO %s" % old_msg_level)
+
+def _pca_project_union(schema_madlib, source_table, pc_table, out_table,
+ row_id, original_row_id, grouping_cols, grouping_cols_clause,
+ residual_table, result_summary_table, grp_id, grouping_where_clause,
+ select_grouping_cols, grouping_cols_values, temp_source_table_columns,
+ temp_pc_table_columns, is_sparse, col_id, val_id, row_dim, col_dim, **kwargs):
+ """
+ The pca_project is performed over each group, if any.
+
+ Args:
+ @param schema_madlib -- madlib schema name
+ @param source_table -- Source table name (dense matrix)
+ @param pc_table -- Output table name for the principal components
+ @param out_table -- Output table name
+ @param row_id -- Column name for the ID for each row
+ @param original_row_id -- copy of the row_id originally passed
+ @param grouping_cols -- Comma-separated list of grouping columns (Default: NULL)
+ @param grouping_cols_clause -- Part of the SQL query to be used with grouping_cols
+ @param residual_table -- Residual table name
+ @param result_summary_table -- Table name to store summary of results (Default: NULL)
+ @param grp_id -- a place holder id for each group
+ @param grouping_where_clause -- WHERE clause using grouping_cols
+ @param select_grouping_cols -- SELECT clause using grouping_cols
+ @param grouping_cols_values -- distinct values of the grouping_cols
+ @param temp_source_table_columns -- SELECT caluse for creating temporary copy of the source_table
+ @param temp_pc_table_columns -- non grouping_cols of the source_table
+ @param is_sparse -- specifies if the PCA call is for sparse or dense matrices
+ @param col_id -- sparse representation based detail
+ @param val_id -- sparse representation based detail
+ @param row_dim -- sparse representation based detail
+ @param col_dim -- sparse representation based detail
+
+ Returns:
+ None
+ """
+ out_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+ ## Creation of this temp table is unnecessary if the scenario does not involve
+ ## grouping, and/or, the input table had perfect values for the row_id column.
+ ## This temp table will ensure pca works even when row_id of the source_table
+ ## does not have serially increasing numbers starting from 1;
+ source_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+ plpy.execute("""
+ CREATE TEMP TABLE {source_table_grouped} AS
+ SELECT {temp_source_table_columns}
+ FROM {source_table}
+ {grouping_where_clause}
+ """.format(source_table_grouped=source_table_grouped,
+ source_table=source_table, grouping_where_clause=grouping_where_clause,
+ temp_source_table_columns=temp_source_table_columns))
+ if grouping_cols:
+ pc_table_grouped = "pg_temp." + unique_string() + "group_" + str(grp_id)
+ plpy.execute("""
+ CREATE TEMP TABLE {pc_table_grouped} AS
+ SELECT {temp_pc_table_columns}
+ FROM {pc_table}
+ {grouping_where_clause}
+ """.format(pc_table_grouped=pc_table_grouped,
+ pc_table=pc_table, grouping_where_clause=grouping_where_clause,
+ temp_pc_table_columns=temp_pc_table_columns))
+ else:
+ pc_table_grouped = pc_table
+ t0 = time.time() # measure the starting time
# Step 1: Validate the input arguments
- _validate_args(schema_madlib, source_table, pc_table, out_table,
- row_id, None, None, None, None,
- residual_table, result_summary_table)
+ if is_sparse:
+ _validate_args(schema_madlib, source_table_grouped, pc_table, row_id,
+ col_id, val_id, row_dim, col_dim)
+ # Step 1.1: Create a copy of the sparse matrix and add row_dims and col_dims
+ # Warning: This changes the column names of the table
+ sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
+ create_temp_sparse_matrix_table_with_dims(source_table_grouped, sparse_table_copy,
+ row_id, col_id, val_id,
+ row_dim, col_dim)
+
+ # Step 1.2: Densify the input matrix
+ x_dense = "pg_temp." + unique_string() + "_dense"
+ plpy.execute("""
+ SELECT {schema_madlib}.matrix_densify(
+ '{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
+ '{x_dense}', 'row=row_id, col=col_id,val=row_vec')
+ """.format(schema_madlib=schema_madlib,
+ sparse_table_copy=sparse_table_copy, row_id=row_id,
+ col_id=col_id, val_id=val_id, x_dense=x_dense))
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ """.format(sparse_table_copy))
+ source_table_grouped = x_dense
+ row_id = 'row_id'
+ else:
+ # Preserve the mapping between new row_id created and the original row_id. This is
+ # required only for dense input format.
+ temp_row_id = original_row_id + unique_string()
+ row_id_map_table = "rowid" + unique_string()
+ plpy.execute("""
+ CREATE TEMP TABLE {row_id_map_table} AS
+ SELECT
+ {original_row_id} AS {temp_row_id},
+ {select_clause}
+ FROM {source_table}
+ {grouping_where_clause}
+ """.format(row_id_map_table=row_id_map_table,
+ original_row_id=original_row_id,
+ temp_row_id=temp_row_id,
+ source_table=source_table,
+ select_clause="""
+ ROW_NUMBER() OVER({partition_over}) AS row_id
+ """.format(partition_over='' if not grouping_cols else 'PARTITION BY {0}'.format(grouping_cols)),
+ grouping_where_clause=grouping_where_clause))
+ ## Validate the arguments
+ _validate_args(schema_madlib, source_table_grouped, pc_table,
+ row_id, None, None, None, None)
# Make sure that the table has row_id and row_vec
source_table_copy = "pg_temp." + unique_string()
need_new_column_names = cast_dense_input_table_to_correct_columns(
- schema_madlib, source_table, source_table_copy, row_id)
+ schema_madlib, source_table_grouped, source_table_copy, row_id)
if(need_new_column_names):
- source_table = source_table_copy
+ source_table_grouped = source_table_copy
- [row_dim, col_dim] = get_dims(source_table,
+ [row_dim, col_dim] = get_dims(source_table_grouped,
{'row': 'row_id', 'col': 'col_id',
'val': 'row_vec'})
- validate_dense(source_table,
+ validate_dense(source_table_grouped,
{'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
check_col=False, row_dim=row_dim)
@@ -426,49 +804,57 @@ def pca_project(schema_madlib,
row_id,
({schema_madlib}.utils_normalize_data(
row_vec,
- (select column_mean from {pc_table_mean}),
+ (select column_mean from {pc_table_mean}
+ {grouping_where_clause}),
'{x_std_str}'::double precision[]))
AS row_vec
- FROM {source_table}
+ FROM {source_table_grouped}
""".format(schema_madlib=schema_madlib,
pc_table_mean=pc_table_mean,
- source_table=source_table,
+ source_table_grouped=source_table_grouped,
scaled_source_table=scaled_source_table,
+ grouping_where_clause=grouping_where_clause,
x_std_str=x_std_str))
plpy.execute(
"""
SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
'trans=false,row=row_id, col=col_id, val=row_vec',
- '{pc_table}',
+ '{pc_table_grouped}',
'trans=TRUE, row=row_id, col=col_id, val=principal_components',
- '{out_table}',
+ '{out_table_grouped}',
'row=row_id, col=col_id,val=row_vec');
""".format(schema_madlib=schema_madlib,
scaled_source_table=scaled_source_table,
- pc_table=pc_table,
- out_table=out_table))
+ pc_table_grouped=pc_table_grouped,
+ out_table_grouped=out_table_grouped))
# Step 3: Compute the Residual table (if required)
# Residual table: res = mat - proj
+ create_residual_table = False
if residual_table or result_summary_table:
+ if grouping_cols:
+ residual_table_grouped = "pg_temp." + unique_string() + "_temp_residual"
+ else:
+ residual_table_grouped = residual_table
create_temp_residual_table = False
if not residual_table:
create_temp_residual_table = True
- residual_table = "pg_temp." + unique_string() + "_temp_residual"
+ else:
+ create_residual_table = True
approx_table = "pg_temp." + unique_string() + "_approx"
# Build an approximate reconstruction of the data
plpy.execute(
"""
- SELECT {schema_madlib}.matrix_mult('{out_table}',
+ SELECT {schema_madlib}.matrix_mult('{out_table_grouped}',
'row=row_id, col=col_id, val=row_vec',
- '{pc_table}',
+ '{pc_table_grouped}',
'row=row_id, col=col_id, val=principal_components',
'{approx_table}',
'row=row_id, col=col_id, val=row_vec');
""".format(schema_madlib=schema_madlib,
- out_table=out_table,
- pc_table=pc_table,
+ out_table_grouped=out_table_grouped,
+ pc_table_grouped=pc_table_grouped,
approx_table=approx_table))
# Compute the difference between the reconstruction and real data
@@ -481,12 +867,12 @@ def pca_project(schema_madlib,
'{approx_table}',
'row=row_id, col=col_id, val=row_vec',
-1,
- '{residual_table}',
+ '{residual_table_grouped}',
'row=row_id, col=col_id, val=row_vec');
""".format(schema_madlib=schema_madlib,
scaled_source_table=scaled_source_table,
approx_table=approx_table,
- residual_table=residual_table))
+ residual_table_grouped=residual_table_grouped))
# Step 4: Compute the results summary table (if required)
# If the residual table is not asked by the user, but he does ask for
@@ -494,19 +880,19 @@ def pca_project(schema_madlib,
if result_summary_table:
source_table_norm = plpy.execute(
"""
- SELECT {schema_madlib}.matrix_norm('{source_table}',
+ SELECT {schema_madlib}.matrix_norm('{source_table_grouped}',
'row=row_id, col=col_id, val=row_vec') as r
""".format(schema_madlib=schema_madlib,
- source_table=source_table,
+ source_table_grouped=source_table_grouped,
row_id=row_id))[0]['r']
# Compute the norm of the residual table
residual_norm = plpy.execute(
"""
- SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
+ SELECT {schema_madlib}.matrix_norm('{residual_table_grouped}',
'row=row_id, col=col_id, val=row_vec') as r
""".format(schema_madlib=schema_madlib,
- residual_table_name=residual_table,
+ residual_table_grouped=residual_table_grouped,
row_id=row_id))[0]['r']
# Compute the relative error of the norm
# Prevent division by zero
@@ -514,12 +900,6 @@ def pca_project(schema_madlib,
relative_residual_norm = residual_norm / source_table_norm
else:
relative_residual_norm = 0
- plpy.execute(
- """
- CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
- residual_norm FLOAT8,
- relative_residual_norm FLOAT8);
- """.format(result_summary_table=result_summary_table))
# Compute the time in milli-seconds
t1 = time.time()
dt = (t1 - t0) * 1000.
@@ -529,19 +909,87 @@ def pca_project(schema_madlib,
INSERT INTO {result_summary_table} VALUES
({dt},
{residual_norm}::double precision,
- {relative_residual_norm}::double precision);
- """.format(dt=dt,
- residual_norm=residual_norm,
+ {relative_residual_norm}::double precision
+ {grouping_cols_values}
+ );
+ """.format(dt=dt, residual_norm=residual_norm,
result_summary_table=result_summary_table,
- relative_residual_norm=relative_residual_norm))
+ relative_residual_norm=relative_residual_norm,
+ grouping_cols_values=grouping_cols_values))
plpy.execute("""
DROP TABLE IF EXISTS {approx_table};
""".format(approx_table=approx_table))
if create_temp_residual_table:
plpy.execute("""
- DROP TABLE IF EXISTS {residual_table};
- """.format(residual_table=residual_table))
+ DROP TABLE IF EXISTS {0};
+ """.format(residual_table_grouped))
+
+ if is_sparse:
+ ## We don't have to join based on row_id for sparse project.
+ if create_residual_table:
+ if grouping_cols:
+ plpy.execute("""
+ INSERT INTO {residual_table}
+ SELECT * {select_grouping_cols}
+ FROM {residual_table_grouped}
+ """.format(residual_table=residual_table,
+ select_grouping_cols=select_grouping_cols,
+ residual_table_grouped=residual_table_grouped))
+ plpy.execute("""
+ INSERT INTO {out_table}
+ SELECT * {select_grouping_cols}
+ FROM {out_table_grouped}
+ """.format(out_table=out_table,
+ select_grouping_cols=select_grouping_cols,
+ out_table_grouped=out_table_grouped))
+ else:
+ output_table_cols = get_cols(out_table_grouped)
+ output_table_cols.remove('row_id')
+ output_table_select_clause = """{row_id_map_table}.{temp_row_id},
+ {out_table_cols}
+ {select_grouping_cols}
+ """.format(row_id_map_table=row_id_map_table,
+ temp_row_id=temp_row_id,
+ out_table_cols=', '.join(output_table_cols),
+ select_grouping_cols=select_grouping_cols)
+ if create_residual_table:
+ if grouping_cols:
+ plpy.execute("""
+ INSERT INTO {residual_table}
+ SELECT {select_clause}
+ FROM {residual_table_grouped}
+ INNER JOIN {row_id_map_table}
+ ON {row_id_map_table}.row_id={residual_table_grouped}.row_id
+ """.format(residual_table=residual_table,
+ select_clause=output_table_select_clause,
+ residual_table_grouped=residual_table_grouped,
+ row_id_map_table=row_id_map_table))
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0}
+ """.format(residual_table_grouped))
+
+ plpy.execute("""
+ INSERT INTO {out_table}
+ SELECT {select_clause}
+ FROM {out_table_grouped}
+ INNER JOIN {row_id_map_table}
+ ON {row_id_map_table}.row_id={out_table_grouped}.row_id
+ """.format(out_table=out_table,
+ select_clause=output_table_select_clause,
+ out_table_grouped=out_table_grouped,
+ row_id_map_table=row_id_map_table))
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ """.format(row_id_map_table))
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ DROP TABLE IF EXISTS {1};
+ DROP TABLE IF EXISTS {2};
+ """.format(scaled_source_table,
+ source_table_grouped, out_table_grouped))
+ if grouping_cols:
+ plpy.execute("""
+ DROP TABLE IF EXISTS {0};
+ """.format(pc_table_grouped))
- plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
- plpy.execute("SET client_min_messages TO %s" % old_msg_level)
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca_project.sql_in b/src/ports/postgres/modules/pca/pca_project.sql_in
index fde15d0..814292c 100644
--- a/src/ports/postgres/modules/pca/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/pca_project.sql_in
@@ -259,6 +259,219 @@ SELECT * FROM residual_table ORDER BY row_id;
5 | {-0.0333376637524658,-0.455268589780183,1.55376831915842}
6 | {0.00277445701511336,0.037888771752409,-0.129309103509956}
</pre>
+-# Run the PCA project function on subsets of an input table based on grouping columns.
+Note that the 'pc_table' used for projection must also be generated using the same
+grouping columns. Create sample data:
+<pre class="example">
+DROP TABLE IF EXISTS mat_with_grouping;
+CREATE TABLE mat_with_grouping (
+ row_id integer,
+ row_vec double precision[],
+ matrix_id integer
+);
+COPY mat_with_grouping (row_id, row_vec, matrix_id) FROM stdin DELIMITER '|';
+1|{1,2,3}|1
+2|{2,1,2}|1
+3|{3,2,1}|1
+4|{1,2,3,4,5}|2
+5|{2,1,2,4,5}|2
+6|{3,2,1,4,5}|2
+7|{1,2,4,4,5}|2
+8|{1,3,3,4,5}|2
+9|{2,2,2,4,5}|2
+10|{3,3,2,4,5}|2
+11|{2,2,3,4,5}|2
+\.
+</pre>
+-# Run the PCA function using grouping_cols and capture 90% of the proportion of variance:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grp;
+DROP TABLE IF EXISTS result_table_grp_mean;
+SELECT madlib.pca_train( 'mat_with_grouping',
+ 'result_table_grp',
+ 'row_id',
+ 0.9,
+ 'matrix_id'
+ );
+</pre>
+-# Project the original data to low-dimensional representation, based on independent PCA models
+learnt for each group:
+<pre class="example">
+DROP TABLE IF EXISTS mat_with_grouping_projected;
+SELECT madlib.pca_project(
+ 'mat_with_grouping',
+ 'result_table_grp',
+ 'mat_with_grouping_projected',
+ 'row_id'
+ );
+</pre>
+-# Check the error in the projection:
+<pre class="example">
+SELECT * FROM mat_with_grouping_projected;
+</pre>
+<pre class="result">
+ row_id | row_vec | matrix_id
+--------+----------------------------------------+-----------
+ 1 | {1.41421356237309,0.33333333333} | 1
+ 2 | {1.08709337828423e-16,-0.66666666667} | 1
+ 3 | {-1.41421356237309,0.33333333333} | 1
+ 4 | {-0.947334292839873,0.757198329919689} | 2
+ 5 | {0.503945652182431,-0.357198389962114} | 2
+ 6 | {1.87289054234556,-1.63346278982675} | 2
+ 7 | {-1.69764931050669,1.55408536983517} | 2
+ 8 | {-0.98850182026946,0.676264489928273} | 2
+ 9 | {0.462778124752844,-0.43813222995353} | 2
+ 10 | {1.08140799724916,-0.917509589902686} | 2
+ 11 | {-0.287536892913973,0.358754809961949} | 2
+</pre>
+
+-# Create sample data in sparse matrix form:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+ row_id integer,
+ col_id integer,
+ val_id integer
+ );
+INSERT INTO sparse_mat VALUES
+(1, 2, 4.0),
+(1, 5, 6.0),
+(3, 8, 4.0),
+(5, 4, 2.0),
+(6, 6, 12.0),
+(8, 1, 2.0),
+(8, 7, 2.0),
+(9, 3, 4.0),
+(9, 8, 2.0);
+</pre>
+-# Run the PCA sparse function for a specified number of principal components:
+<pre class="example">
+DROP TABLE IF EXISTS result_table, result_table_mean;
+SELECT madlib.pca_sparse_train(
+ 'sparse_mat',
+ 'result_table',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 9,
+ 8,
+ 5);
+</pre>
+-# Project the original sparse data to low-dimensional representation:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_out;
+SELECT madlib.pca_sparse_project(
+ 'sparse_mat',
+ 'result_table',
+ 'sparse_mat_out',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 9,
+ 8
+ );
+</pre>
+-# Check the error in the projection:
+<pre class="example">
+SELECT * FROM sparse_mat_out ORDER BY row_id;
+</pre>
+<pre class="result">
+ row_id | row_vec
+--------+----------------------------------------------------------------------------------------------------
+ 1 | {1.34695183995672,0.879135043008323,1.87818383323404,-2.82842712474619,1.24074659599017}
+ 2 | {1.63437198343112,-5.32859025290785,-0.240160697276225,-2.36847578586602e-15,0.295172631209684}
+ 3 | {1.43909964102331,1.28442554799452,-2.99624380589971,-7.21876262469717e-15,1.06626861836748}
+ 4 | {1.21839219844249,0.517445462630873,0.298488396687886,2.88426689939176e-15,-1.34560049142698}
+ 5 | {-10.7325138823598,-0.136175420017185,-0.0338708165820332,8.58109879450845e-16,0.0577033753567249}
+ 6 | {1.21839219844249,0.517445462630873,0.298488396687885,2.1626219333854e-15,-1.34560049142698}
+ 7 | {1.21839219844249,0.517445462630873,0.298488396687885,1.91282175284474e-15,-1.34560049142698}
+ 8 | {1.34695183995672,0.879135043008322,1.87818383323403,2.82842712474619,1.24074659599017}
+ 9 | {1.30996198263469,0.86973365102307,-1.38155753676828,-2.88889282865906e-15,0.136163657369328}
+</pre>
+-# Create sample data in sparse matrix form, with a grouping column:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_with_grouping;
+CREATE TABLE sparse_mat_with_grouping (
+ row_id integer,
+ col_id integer,
+ val_id integer,
+ matrix_id integer
+);
+COPY sparse_mat_with_grouping (row_id, col_id, val_id, matrix_id) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|1
+8|1|2|1
+8|7|2|1
+9|3|4|1
+9|8|2|1
+1|2|4|2
+1|5|6|2
+6|6|12|2
+8|1|2|2
+8|7|2|2
+\.
+</pre>
+-# Run the PCA sparse function using grouping_cols and capture 90% of the proportion of variance:
+<pre class="example">
+DROP TABLE IF EXISTS result_table_grouped, result_table_grouped_mean;
+SELECT madlib.pca_sparse_train (
+ 'sparse_mat_with_grouping',
+ 'result_table_grouped',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 10,
+ 10,
+ 0.9,
+ 'matrix_id'
+ );
+</pre>
+-# Project the original sparse data to low-dimensional representation, based on independent PCA models
+learnt for each group:
+<pre class="example">
+DROP TABLE IF EXISTS sparse_mat_with_grouping_projected;
+SELECT madlib.pca_sparse_project(
+ 'sparse_mat_with_grouping',
+ 'result_table_grouped',
+ 'sparse_mat_with_grouping_projected',
+ 'row_id',
+ 'col_id',
+ 'val_id',
+ 10,
+ 10
+ );
+</pre>
+-# Check the error in the projection:
+<pre class="example">
+SELECT * FROM sparse_mat_with_grouping_projected ORDER BY matrix_id,row_id;
+</pre>
+<pre class="result">
+ row_id | row_vec | matrix_id
+--------+---------------------------------------------------------------------------------+-----------
+ 1 | {1.22692546452761,0.818988440128327,1.81591522940788,2.82842712474619} | 1
+ 2 | {1.4852284082451,-5.38921614355968,-0.238413962586454,-2.76195922299053e-16} | 1
+ 3 | {1.30995313897398,1.19175698765945,-3.05860526896832,-2.63021567919958e-15} | 1
+ 4 | {1.11099454391467,0.484270184421277,0.294314185921038,-1.61822162787753e-16} | 1
+ 5 | {-10.8525988405155,-0.128517253667187,-0.0335370457586047,8.53109060561845e-16} | 1
+ 6 | {1.11099454391467,0.484270184421277,0.294314185921038,2.9799754824927e-16} | 1
+ 7 | {1.11099454391467,0.484270184421277,0.294314185921038,-8.08358590285308e-16} | 1
+ 8 | {1.22692546452761,0.818988440128326,1.81591522940789,-2.82842712474619} | 1
+ 9 | {1.19351195715706,0.808981987902128,-1.41427400274461,-1.38815367040019e-15} | 1
+ 10 | {1.07707077534015,0.42620698814481,0.230057263479109,-1.46091661600792e-16} | 1
+ 1 | {-1.28753936783518,4.42689855953263} | 2
+ 2 | {-1.56033687316063,0.102219682546935} | 2
+ 3 | {10.8262428615875,-0.605455042777997} | 2
+ 4 | {-1.16530001926412,-0.290932942633583} | 2
+ 5 | {-1.16530001926412,-0.605455042777997} | 2
+ 6 | {-1.12955331641269,-0.605455042777998} | 2
+ 7 | {-1.12955331641269,-0.605455042777998} | 2
+ 8 | {-1.12955331641269,-0.605455042777998} | 2
+ 9 | {-1.12955331641269,-0.605455042777998} | 2
+ 10 | {-1.12955331641269,-0.605455042777998} | 2
+</pre>
@anchor notes
@par Notes
@@ -333,7 +546,7 @@ MADLIB_SCHEMA.pca_project(
out_table TEXT, -- Output table name for the principal components
row_id TEXT, -- Column name for the ID for each row
residual_table TEXT, -- Residual table (Default: NULL)
- result_summary_table TEXT -- Table name to store summary of results (Default: NULL)
+ result_summary_table TEXT -- Table name to store summary of results (Default: NULL)
)
RETURNS VOID AS $$
PythonFunction(pca, pca_project, pca_project)
@@ -457,6 +670,33 @@ $$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._pca_project_union(
+ source_table TEXT, -- Source table name (dense matrix)
+ pc_table TEXT, -- Output table name for the principal components
+ out_table TEXT, -- Output table name
+ row_id TEXT, -- Column name for the ID for each row
+ original_row_id TEXT, -- copy of the row_id originally passed
+ grouping_cols TEXT, -- Comma-separated list of grouping columns (Default: NULL)
+ grouping_cols_clause TEXT, -- Part of the SQL query to be used with grouping_cols
+ residual_table TEXT, -- Residual table name
+ result_summary_table TEXT, -- Table name to store summary of results (Default: NULL)
+ grp_id INTEGER, -- a place holder id for each group
+ grouping_where_clause TEXT, -- WHERE clause using grouping_cols
+ select_grouping_cols TEXT, -- SELECT clause using grouping_cols
+ grouping_cols_values TEXT, -- distinct values of the grouping_cols
+ temp_source_table_columns TEXT, -- SELECT caluse for creating temporary copy of the source_table
+ temp_pc_table_columns TEXT, -- non grouping_cols of the source_table
+ is_sparse BOOLEAN, -- specifies if the PCA call is for sparse or dense matrices
+ col_id TEXT, -- sparse representation based detail
+ val_id TEXT, -- sparse representation based detail
+ row_dim INTEGER, -- sparse representation based detail
+ col_dim INTEGER -- sparse representation based detail
+)
+RETURNS VOID AS $$
+PythonFunction(pca, pca_project, _pca_project_union)
+$$ LANGUAGE plpythonu
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
-- Help and usage functions
-----------------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/test/pca.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca.sql_in b/src/ports/postgres/modules/pca/test/pca.sql_in
index 8cff3db..20093ac 100644
--- a/src/ports/postgres/modules/pca/test/pca.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca.sql_in
@@ -118,6 +118,86 @@ NULL, 5, FALSE, 'result_table_214712398172490838');
select * from result_table_214712398172490837;
select * from result_table_214712398172490838;
+-- Test dense data with grouping
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+ id integer,
+ row_vec double precision[],
+ grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+
+-- Learn individaul PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
+-- Matrix in the column format
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+ id integer,
+ val0 float8,
+ val1 float8,
+ val2 float8,
+ val3 float8,
+ val4 float8,
+ val5 float8,
+ val6 float8,
+ val7 float8,
+ val8 float8,
+ val9 float8,
+ grp integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- Learn individaul PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 0.8,
+'grp', 5, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
-- SPARSE PCA: Make sure all possible default calls for sparse PCA work
-----------------------------------------------------------------------------
@@ -228,9 +308,37 @@ select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
select * from result_table_214712398172490837;
select * from result_table_214712398172490838;
+-- Sparse input data with grouping column
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+ id integer,
+ col_id integer,
+ val_id integer,
+ grp integer
+);
+COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|2
+8|7|2|2
+8|1|2|2
+9|8|2|2
+9|3|4|2
+\.
+-- Learn individaul PCA models based on grouping column (grp)
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+drop table if exists result_table_214712398172490838;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 0.8, 'grp', 0, FALSE, 'result_table_214712398172490838');
+select * from result_table_214712398172490837;
+select * from result_table_214712398172490838;
+
-------------------------------------------------------------------------
-- test a different column name
-alter table sparse_mat rename column row_id to rownr;
+alter table sparse_mat rename column id to rownr;
alter table sparse_mat rename column col_id to colnr;
alter table sparse_mat rename column val_id to vals;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8bd46ae8/src/ports/postgres/modules/pca/test/pca_project.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/test/pca_project.sql_in b/src/ports/postgres/modules/pca/test/pca_project.sql_in
index 82c547a..beb0ef4 100644
--- a/src/ports/postgres/modules/pca/test/pca_project.sql_in
+++ b/src/ports/postgres/modules/pca/test/pca_project.sql_in
@@ -30,8 +30,6 @@ drop table if exists result_table_214712398172490837;
drop table if exists result_table_214712398172490837_mean;
select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
'row_id', 'col_id', 'val_id', 10, 10, 10);
-select * from result_table_214712398172490837;
-
drop table if exists out_table_214712398172490837;
select pca_sparse_project( 'sparse_mat',
@@ -120,6 +118,7 @@ select pca_project( 'mat',
drop table if exists out_table_214712398172490837;
drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
select pca_project( 'mat',
'result_table_214712398172490837',
'out_table_214712398172490837',
@@ -172,4 +171,129 @@ select pca_project( 'cmat',
'out_table_214712398172490837',
'row_id');
+-- Sparse matrix with grouping column
+DROP TABLE IF EXISTS sparse_mat;
+CREATE TABLE sparse_mat (
+ id integer,
+ col_id integer,
+ val_id integer,
+ grp integer
+);
+COPY sparse_mat (id, col_id, val_id, grp) FROM stdin delimiter '|';
+1|2|4|1
+1|5|6|1
+3|8|4|1
+5|4|2|1
+6|6|12|2
+8|7|2|2
+8|1|2|2
+9|8|2|2
+9|3|4|2
+\.
+-- project sparse matrix using model leart with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_sparse_train('sparse_mat', 'result_table_214712398172490837',
+'id', 'col_id', 'val_id', 10, 10, 10, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists summary_table_214712398172490837;
+select pca_sparse_project( 'sparse_mat',
+ 'result_table_214712398172490837',
+ 'out_table_214712398172490837',
+ 'id',
+ 'col_id',
+ 'val_id',
+ 10,
+ 10,
+ 'residual_table_214712398172490837',
+ 'summary_table_214712398172490837');
+
+-- Test data (Indentical to SVD) with grouping column
+DROP TABLE IF EXISTS mat;
+CREATE TABLE mat (
+ id integer,
+ row_vec double precision[],
+ grp integer
+);
+
+COPY mat (id, row_vec, grp) FROM stdin delimiter '|';
+1|{396,840,353,446,318,886,15,584,159,383}|1
+2|{691,58,899,163,159,533,604,582,269,390}|1
+3|{293,742,298,75,404,857,941,662,846,2}|1
+4|{462,532,787,265,982,306,600,608,212,885}|1
+5|{304,151,337,387,643,753,603,531,459,652}|1
+6|{327,946,368,943,7,516,272,24,591,204}|1
+7|{877,59,260,302,891,498,710,286,864,675}|1
+8|{458,959,774,376,228,354,300,669,718,565}|2
+9|{824,390,818,844,180,943,424,520,65,913}|2
+10|{882,761,398,688,761,405,125,484,222,873}|2
+11|{528,1,860,18,814,242,314,965,935,809}|2
+12|{492,220,576,289,321,261,173,1,44,241}|2
+13|{415,701,221,503,67,393,479,218,219,916}|2
+14|{350,192,211,633,53,783,30,444,176,932}|2
+15|{909,472,871,695,930,455,398,893,693,838}|2
+16|{739,651,678,577,273,935,661,47,373,618}|2
+\.
+-- project sparse matrix using model leart with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('mat', 'result_table_214712398172490837', 'id', 5, 'grp');
+
+drop table if exists out_table_214712398172490837;
+drop table if exists residual_table_214712398172490837;
+drop table if exists result_summary_table_214712398172490837;
+select pca_project( 'mat',
+ 'result_table_214712398172490837',
+ 'out_table_214712398172490837',
+ 'id',
+ 'residual_table_214712398172490837',
+ 'result_summary_table_214712398172490837');
+
+
+-- Matrix in the column format with grouping column
+DROP TABLE IF EXISTS cmat;
+CREATE TABLE cmat (
+ id integer,
+ val0 float8,
+ val1 float8,
+ val2 float8,
+ val3 float8,
+ val4 float8,
+ val5 float8,
+ val6 float8,
+ val7 float8,
+ val8 float8,
+ val9 float8,
+ grp integer
+);
+
+COPY cmat (id, val0, val1, val2, val3, val4, val5, val6, val7, val8, val9, grp) FROM stdin delimiter '|';
+1|396|840|353|446|318|886|15|584|159|383|1
+2|691|58|899|163|159|533|604|582|269|390|1
+3|293|742|298|75|404|857|941|662|846|2|1
+4|462|532|787|265|982|306|600|608|212|885|1
+5|304|151|337|387|643|753|603|531|459|652|1
+6|327|946|368|943|7|516|272|24|591|204|1
+7|877|59|260|302|891|498|710|286|864|675|2
+8|458|959|774|376|228|354|300|669|718|565|2
+9|824|390|818|844|180|943|424|520|65|913|2
+10|882|761|398|688|761|405|125|484|222|873|2
+11|528|1|860|18|814|242|314|965|935|809|2
+12|492|220|576|289|321|261|173|1|44|241|2
+13|415|701|221|503|67|393|479|218|219|916|2
+14|350|192|211|633|53|783|30|444|176|932|2
+15|909|472|871|695|930|455|398|893|693|838|2
+16|739|651|678|577|273|935|661|47|373|618|2
+\.
+-- project sparse matrix using model leart with grouping_cols
+drop table if exists result_table_214712398172490837;
+drop table if exists result_table_214712398172490837_mean;
+select pca_train('cmat', 'result_table_214712398172490837', 'id', 5, 'grp');
+drop table if exists out_table_214712398172490837;
+select pca_project( 'cmat',
+ 'result_table_214712398172490837',
+ 'out_table_214712398172490837',
+ 'id');