You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nk...@apache.org on 2021/01/20 23:57:34 UTC
[madlib] branch master updated: DL: Add caching to automl interface

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new 00c1259  DL: Add caching to automl interface
00c1259 is described below

commit 00c1259152d998a642a0e99a12d98616de5279fd
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Thu Dec 17 17:15:58 2020 -0800

    DL: Add caching to automl interface
    
    JIRA: MADLIB-1461
---
 .../deep_learning/madlib_keras_automl.py_in        |  3 +-
 .../deep_learning/madlib_keras_automl.sql_in       | 16 +++++-
 .../madlib_keras_automl_hyperband.py_in            | 20 ++++---
 .../madlib_keras_automl_hyperopt.py_in             | 19 ++++--
 .../madlib_keras_fit_multiple_model.sql_in         |  2 +-
 .../deep_learning/test/madlib_keras_automl.sql_in  | 67 ++++++++++++++++++++++
 6 files changed, 110 insertions(+), 17 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
index 1be2db5..b0383f5 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
@@ -63,7 +63,7 @@ class KerasAutoML(object):
                  model_id_list, compile_params_grid, fit_params_grid, automl_method='hyperband',
                  automl_params=None, random_state=None, object_table=None,
                  use_gpus=False, validation_table=None, metrics_compute_frequency=None,
-                 name=None, description=None, **kwargs):
+                 name=None, description=None, use_caching=False, **kwargs):
         if is_platform_pg():
             plpy.error(
                 "DL: AutoML is not supported on PostgreSQL.")
@@ -109,6 +109,7 @@ class KerasAutoML(object):
         self.metrics_compute_frequency = metrics_compute_frequency
         self.name = name
         self.description = description
+        self.use_caching = use_caching
 
         if self.validation_table:
             AutoMLConstants.LOSS_METRIC = 'validation_loss_final'
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
index 66b1a91..8a41328 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
@@ -92,7 +92,8 @@ madlib_keras_automl(
     validation_table,
     metrics_compute_frequency,
     name,
-    description
+    description,
+    use_caching
     )
 </pre>
 
@@ -278,6 +279,16 @@ madlib_keras_automl(
     Free text string to provide a description, if desired.
   </DD>
 
+  <DT>use_caching (optional)</DT>
+  <DD>BOOLEAN, default: FALSE. Use caching of images in memory on the
+  segment in order to speed up processing.
+
+  @note
+  When set to TRUE, image byte arrays on each segment are maintained
+  in cache (GD). This can speed up training significantly, but
+  memory usage per segment increases. In effect, it
+  requires enough available memory on a segment so that all images
+  residing on that segment can be read into memory.</DD>
 </dl>
 
 <b>Output tables</b>
@@ -1370,7 +1381,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_automl(
     validation_table               VARCHAR DEFAULT NULL,
     metrics_compute_frequency      INTEGER DEFAULT NULL,
     name                           VARCHAR DEFAULT NULL,
-    description                    VARCHAR DEFAULT NULL
+    description                    VARCHAR DEFAULT NULL,
+    use_caching                    BOOLEAN DEFAULT FALSE
 ) RETURNS VOID AS $$
 if automl_method is None or automl_method.lower() == 'hyperband':
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_automl_hyperband')
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
index d4f5211..a3437d8 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
@@ -153,13 +153,13 @@ class AutoMLHyperband(KerasAutoML):
                  model_id_list, compile_params_grid, fit_params_grid, automl_method,
                  automl_params, random_state=None, object_table=None,
                  use_gpus=False, validation_table=None, metrics_compute_frequency=None,
-                 name=None, description=None, **kwargs):
+                 name=None, description=None, use_caching=False, **kwargs):
         automl_method = automl_method if automl_method else AutoMLConstants.HYPERBAND
         automl_params = automl_params if automl_params else 'R=6, eta=3, skip_last=0'
         KerasAutoML.__init__(self, schema_madlib, source_table, model_output_table, model_arch_table,
                              model_selection_table, model_id_list, compile_params_grid, fit_params_grid,
                              automl_method, automl_params, random_state, object_table, use_gpus,
-                             validation_table, metrics_compute_frequency, name, description, **kwargs)
+                             validation_table, metrics_compute_frequency, name, description, use_caching, **kwargs)
         self.validate_and_define_inputs()
         self.create_model_output_table()
         self.create_model_output_info_table()
@@ -245,17 +245,23 @@ class AutoMLHyperband(KerasAutoML):
             self.reconstruct_temp_mst_table(i, ranges_dict, configs_prune_lookup) # has keys to evaluate
             active_keys = plpy.execute("SELECT {ModelSelectionSchema.MST_KEY} " \
                                        "FROM {AutoMLSchema.MST_TABLE}".format(AutoMLSchema=AutoMLConstants,
-                                                                                   ModelSelectionSchema=ModelSelectionSchema))
+                                                                              ModelSelectionSchema=ModelSelectionSchema))
             for k in active_keys:
                 i_dict[k[ModelSelectionSchema.MST_KEY]] += 1
             self.warm_start = int(i != 0)
             mcf = self.metrics_compute_frequency if self._is_valid_metrics_compute_frequency(num_iterations) else None
             start_time = time.time()
             with SetGUC("plan_cache_mode", "force_generic_plan"):
-                model_training = FitMultipleModel(self.schema_madlib, self.source_table, AutoMLConstants.MODEL_OUTPUT_TABLE,
-                                                AutoMLConstants.MST_TABLE, num_iterations, self.use_gpus,
-                                                self.validation_table, mcf, self.warm_start, self.name, self.description,
-						 metrics_elapsed_time_offset=metrics_elapsed_time_offset)
+                model_training = FitMultipleModel(self.schema_madlib,
+                                                  self.source_table,
+                                                  AutoMLConstants.MODEL_OUTPUT_TABLE,
+                                                  AutoMLConstants.MST_TABLE,
+                                                  num_iterations, self.use_gpus,
+                                                  self.validation_table, mcf,
+                                                  self.warm_start, self.name,
+                                                  self.description,
+                                                  self.use_caching,
+                                                  metrics_elapsed_time_offset)
                 model_training.fit_multiple_model()
             metrics_elapsed_time_offset += time.time() - start_time
             self.update_model_output_table()
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
index b852e14..424cdd1 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
@@ -49,13 +49,14 @@ class AutoMLHyperopt(KerasAutoML):
                  model_id_list, compile_params_grid, fit_params_grid, automl_method,
                  automl_params, random_state=None, object_table=None,
                  use_gpus=False, validation_table=None, metrics_compute_frequency=None,
-                 name=None, description=None, **kwargs):
+                 name=None, description=None, use_caching=False, **kwargs):
         automl_method = automl_method if automl_method else AutoMLConstants.HYPEROPT
         automl_params = automl_params if automl_params else 'num_configs=20, num_iterations=5, algorithm=tpe'
         KerasAutoML.__init__(self, schema_madlib, source_table, model_output_table, model_arch_table,
                              model_selection_table, model_id_list, compile_params_grid, fit_params_grid,
                              automl_method, automl_params, random_state, object_table, use_gpus,
-                             validation_table, metrics_compute_frequency, name, description, **kwargs)
+                             validation_table, metrics_compute_frequency, name,
+                             description, use_caching, **kwargs)
         self.compile_params_grid = self.compile_params_grid.replace('\n', '').replace(' ', '')
         self.fit_params_grid = self.fit_params_grid.replace('\n', '').replace(' ', '')
         try:
@@ -157,10 +158,16 @@ class AutoMLHyperopt(KerasAutoML):
             plpy.info("***Evaluating {n} newly suggested model configurations***".format(n=n))
             start_time = time.time()
             with SetGUC("plan_cache_mode", "force_generic_plan"):
-                model_training = FitMultipleModel(self.schema_madlib, self.source_table, AutoMLConstants.MODEL_OUTPUT_TABLE,
-                                                  AutoMLConstants.MST_TABLE, self.num_iters, self.use_gpus, self.validation_table,
-                                                  self.metrics_compute_frequency, False, self.name, self.description,
-                                                  metrics_elapsed_time_offset=metrics_elapsed_time_offset)
+                model_training = FitMultipleModel(self.schema_madlib,
+                                                  self.source_table,
+                                                  AutoMLConstants.MODEL_OUTPUT_TABLE,
+                                                  AutoMLConstants.MST_TABLE,
+                                                  self.num_iters, self.use_gpus,
+                                                  self.validation_table,
+                                                  self.metrics_compute_frequency,
+                                                  False, self.name, self.description,
+                                                  self.use_caching,
+                                                  metrics_elapsed_time_offset)
                 model_training.fit_multiple_model()
             metrics_elapsed_time_offset += time.time() - start_time
             if make_mst_summary:
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index b0ac70b..07fb57e 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -239,7 +239,7 @@ madlib_keras_fit_multiple_model(
 
   @note
   When set to TRUE, image byte arrays on each segment are maintained 
-  in cache (SD). This can speed up training significantly, however the 
+  in cache (GD). This can speed up training significantly, however the
   memory usage per segment increases.  In effect, it 
   requires enough available memory on a segment so that all images 
   residing on that segment can be read into memory.
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
index cbb6dd0..d4841a9 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
@@ -70,6 +70,39 @@ SELECT assert(
     normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
 ) FROM (SELECT * FROM automl_output_summary) summary;
 
+-- caching test case
+DROP TABLE IF EXISTS automl_output, automl_output_info, automl_output_summary, automl_mst_table,
+automl_mst_table_summary;
+SELECT madlib_keras_automl('iris_data_packed', 'automl_output', 'iris_model_arch', 'automl_mst_table',
+    ARRAY[1], $${'loss': ['categorical_crossentropy'], 'optimizer_params_list': [{'optimizer': ['Adam', 'SGD'],
+    'lr': [0.01, 0.011, 'log']} ], 'metrics':['accuracy'] }$$, $${'batch_size': [50], 'epochs': [1]}$$,
+    'hyperopt', 'num_configs=5, num_iterations=6, algorithm=rand', NULL, NULL, FALSE, NULL, 1, 'test1', 'test1 descr', TRUE);
+SELECT assert(
+    source_table = 'iris_data_packed' AND
+    validation_table IS NULL AND
+    model = 'automl_output' AND
+    model_info = 'automl_output_info' AND
+    dependent_varname = 'class_text' AND
+    independent_varname = 'attributes' AND
+    model_arch_table = 'iris_model_arch' AND
+    model_selection_table = 'automl_mst_table' AND
+    automl_method = 'hyperopt' AND
+    automl_params = 'num_configs=5, num_iterations=6, algorithm=rand' AND
+    random_state IS NULL AND
+    object_table IS NULL AND
+    use_gpus = FALSE AND
+    metrics_compute_frequency = 1 AND
+    name = 'test1' AND
+    description = 'test1 descr' AND
+    start_training_time < now() AND
+    end_training_time < now() AND
+    madlib_version IS NOT NULL AND
+    num_classes = 3 AND
+    class_values = '{Iris-setosa,Iris-versicolor,Iris-virginica}' AND
+    dependent_vartype = 'character varying' AND
+    normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
+) FROM (SELECT * FROM automl_output_summary) summary;
+
 -- Validate output info table for metrics_iters NOT NULL
 SELECT assert(
     metrics_iters = ARRAY[1,2,3,4,5,6], 'Invalid metrics_iters value in output info table. Actual:' || __to_char(info)
@@ -331,6 +364,40 @@ SELECT assert(
     normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
 ) FROM (SELECT * FROM automl_output_summary) summary;
 
+-- caching test case
+DROP TABLE IF EXISTS automl_output, automl_output_info, automl_output_summary, automl_mst_table,
+    automl_mst_table_summary;
+SELECT madlib_keras_automl('iris_data_packed', 'automl_output', 'iris_model_arch', 'automl_mst_table',
+	ARRAY[1,2], $${'loss': ['categorical_crossentropy'], 'optimizer_params_list': [ {'optimizer': ['Adagrad', 'Adam'],
+	'lr': [0.9, 0.95, 'log'], 'epsilon': [0.3, 0.5, 'log_near_one']}, {'optimizer': ['Adam', 'SGD'], 'lr': [0.6, 0.65, 'log']} ],
+	'metrics':['accuracy'] }$$, $${'batch_size': [2, 4], 'epochs': [3]}$$, 'hyperband', 'R=5, eta=5, skip_last=1',
+	NULL, NULL, FALSE, NULL, NULL, NULL, NULL, TRUE);
+SELECT assert(
+    source_table = 'iris_data_packed' AND
+    validation_table IS NULL AND
+    model = 'automl_output' AND
+    model_info = 'automl_output_info' AND
+    dependent_varname = 'class_text' AND
+    independent_varname = 'attributes' AND
+    model_arch_table = 'iris_model_arch' AND
+    model_selection_table = 'automl_mst_table' AND
+    automl_method = 'hyperband' AND
+    automl_params = 'R=5, eta=5, skip_last=1' AND
+    random_state IS NULL AND
+    object_table IS NULL AND
+    use_gpus = FALSE AND
+    metrics_compute_frequency = 1 AND
+    name IS NULL AND
+    description IS NULL AND
+    start_training_time < now() AND
+    end_training_time < now() AND
+    madlib_version IS NOT NULL AND
+    num_classes = 3 AND
+    class_values = '{Iris-setosa,Iris-versicolor,Iris-virginica}' AND
+    dependent_vartype = 'character varying' AND
+    normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
+) FROM (SELECT * FROM automl_output_summary) summary;
+
 -- Validate metrics_elapsed_time
 -- We know that the schedule for the above automl query looks like
 --  s | i | n_i | r_i