Posted to commits@madlib.apache.org by nj...@apache.org on 2018/04/04 20:50:41 UTC

[2/2] madlib git commit: MLP: Remove minibatch training dependency on original source

MLP: Remove minibatch training dependency on original source

The name of the original source table used by the minibatch preprocessor is
stored in a column named 'original_source_table' in the summary table.
Ideally, the original source table does not need to exist once the
preprocessed table has been created, but the MLP training code depended on it
and would fail if that table was deleted. This commit removes that dependency.

Co-authored-by: Nikhil Kak <nk...@pivotal.io>
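
As a rough sketch of the idea: with this change, training reads everything it
needs from the preprocessor summary table rather than from the original source
table. A minimal, hypothetical illustration in Python (the 'source_table',
'dependent_varname' and 'class_values' column names here are assumptions for
the example, not the exact MADlib schema):

    # Runs inside a PL/Python function, where the plpy module is available.
    import plpy

    def read_minibatch_metadata(summary_table):
        # Pull the metadata captured by the minibatch preprocessor so that
        # training never has to query the original source table.
        row = plpy.execute("SELECT * FROM {0}".format(summary_table))[0]
        return {
            'dependent_varname': row['dependent_varname'],
            'class_values': row['class_values'],
            # Kept only for reporting; the table it names may no longer exist.
            'original_source_table': row['source_table'],
        }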


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/1670923b
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/1670923b
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/1670923b

Branch: refs/heads/master
Commit: 1670923bf155395caf51cd7b07edbe9533a6908b
Parents: 29fe759
Author: Nandish Jayaram <nj...@apache.org>
Authored: Tue Apr 3 14:32:03 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Tue Apr 3 17:29:11 2018 -0700

----------------------------------------------------------------------
 src/ports/postgres/modules/convex/mlp_igd.py_in | 22 ++++++++++++++------
 .../postgres/modules/convex/test/mlp.sql_in     |  9 +++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/1670923b/src/ports/postgres/modules/convex/mlp_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 800ec29..687011c 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -141,9 +141,14 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
         else:
             num_output_nodes = get_col_dimension(source_table,
                                                  dependent_varname, dim=2)
-        # Get the type of the original source table's dependent variable column.
-        dependent_type = get_expr_type(pp_summary_dict['dependent_varname'],
-                                       pp_summary_dict['source_table'])
+
+        # This type is used to create the classes_str column in the model
+        # summary table. The CREATE statement for that summary table appends
+        # '[]' to it, so strip any trailing '[]' returned by get_expr_type().
+        dependent_type = get_expr_type(mlp_preprocessor.CLASS_VALUES,
+                                       mlp_preprocessor.summary_table)
+        if dependent_type[-2:] == '[]':
+            dependent_type = dependent_type[:-2]
     else:
         x_mean_table = unique_string(desp='x_mean_table')
         tbl_data_scaled = unique_string(desp="tbl_data_scaled")
@@ -184,6 +189,7 @@ def mlp(schema_madlib, source_table, output_table, independent_varname,
                 dependent_varname = "ARRAY[" + col_dep_var_norm_new + "]"
             num_output_nodes = get_col_dimension(tbl_data_scaled,
                                                  dependent_varname, dim=1)
+
     # Need layers sizes before validating for warm_start
     layer_sizes = [num_input_nodes] + hidden_layer_sizes + [num_output_nodes]
     col_grp_key = unique_string(desp='col_grp_key')
@@ -451,8 +457,10 @@ def _create_summary_table(args):
     if args['warm_start']:
         plpy.execute("DROP TABLE IF EXISTS {0}".format(args['summary_table']))
 
+
     classes_str = PY2SQL([strip_end_quotes(cl, "'") for cl in args['classes']],
                          array_type=args['dependent_type'])
+
     minibatch_summary_col_names = ''
     minibatch_summary_col_vals = ''
     if args['is_minibatch_enabled']:
@@ -678,7 +686,7 @@ def _validate_dependent_var(source_table, dependent_varname,
         if is_classification:
             _assert(("[]" in expr_type \
                      and is_psql_numeric_type(expr_type[:-2]) \
-                     and not _is_dep_var_multi_dim(dependent_varname, source_table) \
+                     and not _get_dep_var_second_dim(dependent_varname, source_table) \
                     ) \
                     or expr_type in classification_types,
                     "Dependent variable column should either be a numeric 1-D"
@@ -687,14 +695,16 @@ def _validate_dependent_var(source_table, dependent_varname,
             _assert("[]" in expr_type or is_psql_numeric_type(expr_type),
                     "Dependent variable column should be of numeric type.")
 
-def _is_dep_var_multi_dim(dependent_varname, source_table):
+def _get_dep_var_second_dim(dependent_varname, source_table):
     # Check if dependent variable is an array of two or higher dimension
+    # Return the value of the second dimension, or None if the array has
+    # fewer than two dimensions.
     dep_array_sec_dim = plpy.execute("""
             SELECT array_upper({0}, 2) AS n_y
             FROM {1}
             LIMIT 1
         """.format(dependent_varname, source_table))
-    return bool(dep_array_sec_dim[0]['n_y'])
+    return dep_array_sec_dim[0]['n_y']
 
 def _validate_params_based_on_minibatch(source_table, independent_varname,
                                         dependent_varname, weights,

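To summarize the two helper-level changes above with a small, self-contained
sketch (plain Python; 'strip_array_suffix' and 'is_multi_dim' are illustrative
names, not functions in the MADlib module):

    def strip_array_suffix(expr_type):
        # get_expr_type() on an array column yields e.g. 'integer[]'; the model
        # summary needs the element type, so drop a trailing '[]'.
        return expr_type[:-2] if expr_type.endswith('[]') else expr_type

    def is_multi_dim(second_dim):
        # _get_dep_var_second_dim() now returns array_upper(col, 2) directly:
        # None for a 1-D array, otherwise the size of the second dimension.
        # Callers that only need a yes/no answer can still treat it as a bool.
        return second_dim is not None

    assert strip_array_suffix('double precision[]') == 'double precision'
    assert strip_array_suffix('text') == 'text'
    assert is_multi_dim(None) is False
    assert is_multi_dim(3) is True
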
http://git-wip-us.apache.org/repos/asf/madlib/blob/1670923b/src/ports/postgres/modules/convex/test/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/test/mlp.sql_in b/src/ports/postgres/modules/convex/test/mlp.sql_in
index 8a2c92b..a40d35a 100644
--- a/src/ports/postgres/modules/convex/test/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/test/mlp.sql_in
@@ -213,8 +213,15 @@ CREATE TABLE iris_data_batch_summary(
     num_rows_skipped integer,
     grouping_cols text
 );
+-- The availability of the original source table should not be a condition for
+-- MLP to work correctly. Training should work fine even if the original source
+-- table is deleted (this ensures that all the necessary info is captured in
+-- the summary table). So record the original source table as
+-- 'iris_data_does_not_exist' instead of the actual 'iris_data', to mimic the
+-- scenario where the original source table has been deleted and MLP is trained
+-- with only the preprocessed table.
 INSERT INTO iris_data_batch_summary VALUES
-('iris_data','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
+('iris_data_does_not_exist','iris_data_batch','class::TEXT','attributes',30,ARRAY[1,2,3],141,0,'grp');
 -- Create the corresponding standardization table for preprocessed data
 CREATE TABLE iris_data_batch_standardization(
     grp text,