You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@madlib.apache.org by GitBox <gi...@apache.org> on 2021/02/03 12:10:44 UTC

[GitHub] [madlib] orhankislal commented on a change in pull request #536: DL: Fix bugs when passing invalid input/output tables and other misc bugs

orhankislal commented on a change in pull request #536:
URL: https://github.com/apache/madlib/pull/536#discussion_r569349392



##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras.py_in
##########
@@ -114,56 +115,20 @@ def fit(schema_madlib, source_table, model, model_arch_table,
 
     if object_table is not None:
         object_table = "{0}.{1}".format(schema_madlib, quote_ident(object_table))
-
-    source_summary_table = add_postfix(source_table, "_summary")
-    input_tbl_valid(source_summary_table, module_name)
-    src_summary_dict = get_source_summary_table_dict(source_summary_table)
-
-    columns_dict = {}
-    columns_dict['mb_dep_var_cols'] = src_summary_dict['dependent_varname']
-    columns_dict['mb_indep_var_cols'] = src_summary_dict['independent_varname']
-    columns_dict['dep_shape_cols'] = [add_postfix(i, "_shape") for i in columns_dict['mb_dep_var_cols']]
-    columns_dict['ind_shape_cols'] = [add_postfix(i, "_shape") for i in columns_dict['mb_indep_var_cols']]
-
-    multi_dep_count = len(columns_dict['mb_dep_var_cols'])
-    val_dep_var = None
-    val_ind_var = None
-
-    val_dep_shape_cols = None
-    val_ind_shape_cols = None
-    if validation_table:
-        validation_summary_table = add_postfix(validation_table, "_summary")
-        input_tbl_valid(validation_summary_table, module_name)
-        val_summary_dict = get_source_summary_table_dict(validation_summary_table)
-
-        val_dep_var = val_summary_dict['dependent_varname']
-        val_ind_var = val_summary_dict['independent_varname']
-        val_dep_shape_cols = [add_postfix(i, "_shape") for i in val_dep_var]
-        val_ind_shape_cols = [add_postfix(i, "_shape") for i in val_ind_var]
-
     fit_validator = FitInputValidator(
         source_table, validation_table, model, model_arch_table, model_id,
-        columns_dict['mb_dep_var_cols'], columns_dict['mb_indep_var_cols'],
-        columns_dict['dep_shape_cols'], columns_dict['ind_shape_cols'],
         num_iterations, metrics_compute_frequency, warm_start,
-        use_gpus, accessible_gpus_for_seg, object_table,
-        val_dep_var, val_ind_var)
-
-    columns_dict['val_dep_var'] = val_dep_var
-    columns_dict['val_ind_var'] = val_ind_var
-    columns_dict['val_dep_shape_cols'] = val_dep_shape_cols
-    columns_dict['val_ind_shape_cols'] = val_ind_shape_cols
-
-    fit_validator.dependent_varname = columns_dict['mb_dep_var_cols']
-    fit_validator.independent_varname = columns_dict['mb_indep_var_cols']
-    fit_validator.dep_shape_col = columns_dict['dep_shape_cols']
-    fit_validator.ind_shape_col = columns_dict['ind_shape_cols']
-
-    class_values_colnames = [add_postfix(i, "_class_values") for i in columns_dict['mb_dep_var_cols']]
-    src_summary_dict['class_values_type'] =[ get_expr_type(
-        i, fit_validator.source_summary_table) for i in class_values_colnames]
-    src_summary_dict['norm_const_type'] = get_expr_type(
-        NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table)
+        use_gpus, accessible_gpus_for_seg, object_table)
+
+    multi_dep_count = len(fit_validator.dependent_varname)
+    src_summary_dict = fit_validator.src_summary_dict
+    class_values_colnames = [add_postfix(i, "_class_values") for i in
+                             fit_validator.dependent_varname]
+    #TODO do we need these two keys to be set in src_summary_dict ?

Review comment:
       We don't need the `class_values_type` anymore and it seems `norm_const_type` was not necessary even before multi-io changes.

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras.py_in
##########
@@ -360,9 +333,10 @@ def fit(schema_madlib, source_table, model, model_arch_table,
     end_training_time = datetime.datetime.now()
 
     version = madlib_version(schema_madlib)
-    class_values_type = src_summary_dict['class_values_type']
+    #TODO do we need these two variables ?

Review comment:
       No

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
##########
@@ -282,103 +274,117 @@ class FitCommonValidator(object):
         if self.source_table:
             self.source_summary_table = add_postfix(
                 self.source_table, "_summary")
+        if self.validation_table:
+            self.validation_summary_table = add_postfix(
+                self.validation_table, "_summary")
         if self.output_model_table:
             self.output_summary_model_table = add_postfix(
                 self.output_model_table, "_summary")
         self.accessible_gpus_for_seg = accessible_gpus_for_seg
         self.module_name = module_name
-        self.val_dep_var = val_dep_var
-        self.val_ind_var = val_ind_var
 
-        self._validate_common_args()
+        self._validate_tables()
+
+        self.src_summary_dict = self.get_source_summary_table_dict(self.source_summary_table)
+
+        self.dependent_varname = self.src_summary_dict['dependent_varname']
+        self.independent_varname = self.src_summary_dict['independent_varname']
+        if not isinstance(self.dependent_varname, list) or \
+                not isinstance(self.independent_varname, list):
+            #TODO improve error message
+            plpy.error("Input table '{0}' has not been preprocessed. "
+                       "Please run input preprocessor again.".format(self.source_table))

Review comment:
       This is a bit vague, `again` might imply the preprocessor has to be run multiple times to work. `Please run input preprocessor on your dataset before calling training functions.` might be better.

##########
File path: src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
##########
@@ -906,14 +907,14 @@ class MadlibKerasPredictBYOMTestCase(unittest.TestCase):
         self.assertIn('invalid_pred_type', str(error.exception))
 
         # The validation for this test has been disabled

Review comment:
       We should remove this line as well.

##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras.py_in
##########
@@ -1035,7 +1007,6 @@ def get_loss_metric_from_keras_eval(schema_madlib, table, columns_dict, compile_
         weights = '$1'
         mult_sql = ''
         custom_map_var = '$2'
-        plpy.info(eval_sql.format(**locals()))

Review comment:
       Thanks for catching this. I made the same change in the docs PR as well. I'll revert it to avoid a conflict.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org