You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/04/04 20:50:41 UTC
[2/2] madlib git commit: MLP: Remove minibatch training dependency on
original source
MLP: Remove minibatch training dependency on original source
The name of the original source table used by the minibatch preprocessor is
stored in a column named 'original_source_table' in the summary table. Ideally,
the original source table doesn't need to exist after the preprocessed table is
created. The current MLP training code had a dependency on it and would fail if
that table was deleted. This commit removes that dependency.
Co-authored-by: Nikhil Kak <nk...@pivotal.io>
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/1670923b
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/1670923b
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/1670923b
Branch: refs/heads/master
Commit: 1670923bf155395caf51cd7b07edbe9533a6908b
Parents: 29fe759
Author: Nandish Jayaram <nj...@apache.org>
Authored: Tue Apr 3 14:32:03 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Tue Apr 3 17:29:11 2018 -0700
----------------------------------------------------------------------
src/ports/postgres/modules/convex/mlp_igd.py_in | 22 ++++++++++++++------
.../postgres/modules/convex/test/mlp.sql_in | 9 +++++++-
2 files changed, 24 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/1670923b/src/ports/postgres/modules/convex/mlp_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 800ec29..687011c 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -141,9 +141,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
else:
num_output_nodes = get_col_dimension(source_table,
dependent_varname, dim=2)
- # Get the type of the original source table's dependent variable column.
- dependent_type = get_expr_type(pp_summary_dict['dependent_varname'],
- pp_summary_dict['source_table'])
+
+ # This variable is used for creating the classes_str column in the model
+ # summary table. We append [] when we create this column in the create
+ # summary table command so we need to strip it out here.
+ dependent_type = get_expr_type(mlp_preprocessor.CLASS_VALUES,
+ mlp_preprocessor.summary_table)
+ if dependent_type[-2:] == '[]':
+ dependent_type = dependent_type[:-2]
else:
x_mean_table = unique_string(desp='x_mean_table')
tbl_data_scaled = unique_string(desp="tbl_data_scaled")
@@ -184,6 +189,7 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
num_output_nodes = get_col_dimension(tbl_data_scaled,
dependent_varname, dim=1)
+
# Need layers sizes before validating for warm_start
layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes]
col_grp_key = unique_string(desp='col_grp_key')
@@ -451,8 +457,10 @@ def _create_summary_table(args):
if args['warm_start']:
plpy.execute("DROP TABLE IF EXISTS {0}".format(args['summary_table']))
+
classes_str = PY2SQL([strip_end_quotes(cl, "'") for cl in args['classes']],
array_type=args['dependent_type'])
+
minibatch_summary_col_names = ''
minibatch_summary_col_vals = ''
if args['is_minibatch_enabled']:
@@ -678,7 +686,7 @@ def _validate_dependent_var(source_table, dependent_varname,
if is_classification:
_assert(("[]" in expr_type \
and is_psql_numeric_type(expr_type[:-2]) \
- and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+ and not _get_dep_var_second_dim(dependent_varname, source_table) \
) \
or expr_type in classification_types,
"Dependent variable column should either be a numeric 1-D"
@@ -687,14 +695,16 @@ def _validate_dependent_var(source_table, dependent_varname,
_assert("[]" in expr_type or is_psql_numeric_type(expr_type),
"Dependent variable column should be of numeric type.")
-def _is_dep_var_multi_dim(dependent_varname, source_table):
+def _get_dep_var_second_dim(dependent_varname, source_table):
# Check if dependent variable is an array of two or higher dimension
+ # Return the value of the second dimension; returns None if the array has
+ # fewer than two dimensions.
dep_array_sec_dim = plpy.execute("""
SELECT array_upper({0}, 2) AS n_y
FROM {1}
LIMIT 1
""".format(dependent_varname, source_table))
- return bool(dep_array_sec_dim[0]['n_y'])
+ return dep_array_sec_dim[0]['n_y']
def _validate_params_based_on_minibatch(source_table, independent_varname,
dependent_varname, weights,
http://git-wip-us.apache.org/repos/asf/madlib/blob/1670923b/src/ports/postgres/modules/convex/test/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 8a2c92b..a40d35a 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -213,8 +213,15 @@ CREATE TABLE iris_data_batch_summary(
num_rows_skipped integer,
grouping_cols text
);
+-- The availability of the original source table should not be a condition for
+-- MLP to work correctly. It should work fine even if the original source table is
+-- deleted (this basically ensures that all the necessary info is captured in
+-- the summary table). So name the original source table as
+-- 'iris_data_does_not_exist' instead of the original 'iris_data', to mimic the
+-- scenario where the original source table is deleted and MLP is trained with
+-- the preprocessed table.
INSERT INTO iris_data_batch_summary VALUES
-('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
+('iris_data_does_not_exist','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
-- Create the corresponding standardization table for preprocessed data
CREATE TABLE iris_data_batch_standardization(
grp text,