You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nk...@apache.org on 2021/01/20 23:57:34 UTC
[madlib] branch master updated: DL: Add caching to automl interface
This is an automated email from the ASF dual-hosted git repository.
nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new 00c1259 DL: Add caching to automl interface
00c1259 is described below
commit 00c1259152d998a642a0e99a12d98616de5279fd
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Thu Dec 17 17:15:58 2020 -0800
DL: Add caching to automl interface
JIRA: MADLIB-1461
---
.../deep_learning/madlib_keras_automl.py_in | 3 +-
.../deep_learning/madlib_keras_automl.sql_in | 16 +++++-
.../madlib_keras_automl_hyperband.py_in | 20 ++++---
.../madlib_keras_automl_hyperopt.py_in | 19 ++++--
.../madlib_keras_fit_multiple_model.sql_in | 2 +-
.../deep_learning/test/madlib_keras_automl.sql_in | 67 ++++++++++++++++++++++
6 files changed, 110 insertions(+), 17 deletions(-)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
index 1be2db5..b0383f5 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
@@ -63,7 +63,7 @@ class KerasAutoML(object):
model_id_list, compile_params_grid, fit_params_grid, automl_method='hyperband',
automl_params=None, random_state=None, object_table=None,
use_gpus=False, validation_table=None, metrics_compute_frequency=None,
- name=None, description=None, **kwargs):
+ name=None, description=None, use_caching=False, **kwargs):
if is_platform_pg():
plpy.error(
"DL: AutoML is not supported on PostgreSQL.")
@@ -109,6 +109,7 @@ class KerasAutoML(object):
self.metrics_compute_frequency = metrics_compute_frequency
self.name = name
self.description = description
+ self.use_caching = use_caching
if self.validation_table:
AutoMLConstants.LOSS_METRIC = 'validation_loss_final'
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
index 66b1a91..8a41328 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.sql_in
@@ -92,7 +92,8 @@ madlib_keras_automl(
validation_table,
metrics_compute_frequency,
name,
- description
+ description,
+ use_caching
)
</pre>
@@ -278,6 +279,16 @@ madlib_keras_automl(
Free text string to provide a description, if desired.
</DD>
+ <DT>use_caching (optional)</DT>
+ <DD>BOOLEAN, default: FALSE. Use caching of images in memory on the
+ segment in order to speed up processing.
+
+ @note
+ When set to TRUE, image byte arrays on each segment are maintained
+ in cache (GD). This can speed up training significantly; however, the
+ memory usage per segment increases. In effect, it
+ requires enough available memory on a segment so that all images
+ residing on that segment can be read into memory.
</dl>
<b>Output tables</b>
@@ -1370,7 +1381,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_automl(
validation_table VARCHAR DEFAULT NULL,
metrics_compute_frequency INTEGER DEFAULT NULL,
name VARCHAR DEFAULT NULL,
- description VARCHAR DEFAULT NULL
+ description VARCHAR DEFAULT NULL,
+ use_caching BOOLEAN DEFAULT FALSE
) RETURNS VOID AS $$
if automl_method is None or automl_method.lower() == 'hyperband':
PythonFunctionBodyOnly(`deep_learning', `madlib_keras_automl_hyperband')
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
index d4f5211..a3437d8 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperband.py_in
@@ -153,13 +153,13 @@ class AutoMLHyperband(KerasAutoML):
model_id_list, compile_params_grid, fit_params_grid, automl_method,
automl_params, random_state=None, object_table=None,
use_gpus=False, validation_table=None, metrics_compute_frequency=None,
- name=None, description=None, **kwargs):
+ name=None, description=None, use_caching=False, **kwargs):
automl_method = automl_method if automl_method else AutoMLConstants.HYPERBAND
automl_params = automl_params if automl_params else 'R=6, eta=3, skip_last=0'
KerasAutoML.__init__(self, schema_madlib, source_table, model_output_table, model_arch_table,
model_selection_table, model_id_list, compile_params_grid, fit_params_grid,
automl_method, automl_params, random_state, object_table, use_gpus,
- validation_table, metrics_compute_frequency, name, description, **kwargs)
+ validation_table, metrics_compute_frequency, name, description, use_caching, **kwargs)
self.validate_and_define_inputs()
self.create_model_output_table()
self.create_model_output_info_table()
@@ -245,17 +245,23 @@ class AutoMLHyperband(KerasAutoML):
self.reconstruct_temp_mst_table(i, ranges_dict, configs_prune_lookup) # has keys to evaluate
active_keys = plpy.execute("SELECT {ModelSelectionSchema.MST_KEY} " \
"FROM {AutoMLSchema.MST_TABLE}".format(AutoMLSchema=AutoMLConstants,
- ModelSelectionSchema=ModelSelectionSchema))
+ ModelSelectionSchema=ModelSelectionSchema))
for k in active_keys:
i_dict[k[ModelSelectionSchema.MST_KEY]] += 1
self.warm_start = int(i != 0)
mcf = self.metrics_compute_frequency if self._is_valid_metrics_compute_frequency(num_iterations) else None
start_time = time.time()
with SetGUC("plan_cache_mode", "force_generic_plan"):
- model_training = FitMultipleModel(self.schema_madlib, self.source_table, AutoMLConstants.MODEL_OUTPUT_TABLE,
- AutoMLConstants.MST_TABLE, num_iterations, self.use_gpus,
- self.validation_table, mcf, self.warm_start, self.name, self.description,
- metrics_elapsed_time_offset=metrics_elapsed_time_offset)
+ model_training = FitMultipleModel(self.schema_madlib,
+ self.source_table,
+ AutoMLConstants.MODEL_OUTPUT_TABLE,
+ AutoMLConstants.MST_TABLE,
+ num_iterations, self.use_gpus,
+ self.validation_table, mcf,
+ self.warm_start, self.name,
+ self.description,
+ self.use_caching,
+ metrics_elapsed_time_offset)
model_training.fit_multiple_model()
metrics_elapsed_time_offset += time.time() - start_time
self.update_model_output_table()
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
index b852e14..424cdd1 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl_hyperopt.py_in
@@ -49,13 +49,14 @@ class AutoMLHyperopt(KerasAutoML):
model_id_list, compile_params_grid, fit_params_grid, automl_method,
automl_params, random_state=None, object_table=None,
use_gpus=False, validation_table=None, metrics_compute_frequency=None,
- name=None, description=None, **kwargs):
+ name=None, description=None, use_caching=False, **kwargs):
automl_method = automl_method if automl_method else AutoMLConstants.HYPEROPT
automl_params = automl_params if automl_params else 'num_configs=20, num_iterations=5, algorithm=tpe'
KerasAutoML.__init__(self, schema_madlib, source_table, model_output_table, model_arch_table,
model_selection_table, model_id_list, compile_params_grid, fit_params_grid,
automl_method, automl_params, random_state, object_table, use_gpus,
- validation_table, metrics_compute_frequency, name, description, **kwargs)
+ validation_table, metrics_compute_frequency, name,
+ description, use_caching, **kwargs)
self.compile_params_grid = self.compile_params_grid.replace('\n', '').replace(' ', '')
self.fit_params_grid = self.fit_params_grid.replace('\n', '').replace(' ', '')
try:
@@ -157,10 +158,16 @@ class AutoMLHyperopt(KerasAutoML):
plpy.info("***Evaluating {n} newly suggested model configurations***".format(n=n))
start_time = time.time()
with SetGUC("plan_cache_mode", "force_generic_plan"):
- model_training = FitMultipleModel(self.schema_madlib, self.source_table, AutoMLConstants.MODEL_OUTPUT_TABLE,
- AutoMLConstants.MST_TABLE, self.num_iters, self.use_gpus, self.validation_table,
- self.metrics_compute_frequency, False, self.name, self.description,
- metrics_elapsed_time_offset=metrics_elapsed_time_offset)
+ model_training = FitMultipleModel(self.schema_madlib,
+ self.source_table,
+ AutoMLConstants.MODEL_OUTPUT_TABLE,
+ AutoMLConstants.MST_TABLE,
+ self.num_iters, self.use_gpus,
+ self.validation_table,
+ self.metrics_compute_frequency,
+ False, self.name, self.description,
+ self.use_caching,
+ metrics_elapsed_time_offset)
model_training.fit_multiple_model()
metrics_elapsed_time_offset += time.time() - start_time
if make_mst_summary:
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index b0ac70b..07fb57e 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -239,7 +239,7 @@ madlib_keras_fit_multiple_model(
@note
When set to TRUE, image byte arrays on each segment are maintained
- in cache (SD). This can speed up training significantly, however the
+ in cache (GD). This can speed up training significantly; however, the
memory usage per segment increases. In effect, it
requires enough available memory on a segment so that all images
residing on that segment can be read into memory.
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
index cbb6dd0..d4841a9 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_automl.sql_in
@@ -70,6 +70,39 @@ SELECT assert(
normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
) FROM (SELECT * FROM automl_output_summary) summary;
+-- caching test case
+DROP TABLE IF EXISTS automl_output, automl_output_info, automl_output_summary, automl_mst_table,
+automl_mst_table_summary;
+SELECT madlib_keras_automl('iris_data_packed', 'automl_output', 'iris_model_arch', 'automl_mst_table',
+ ARRAY[1], $${'loss': ['categorical_crossentropy'], 'optimizer_params_list': [{'optimizer': ['Adam', 'SGD'],
+ 'lr': [0.01, 0.011, 'log']} ], 'metrics':['accuracy'] }$$, $${'batch_size': [50], 'epochs': [1]}$$,
+ 'hyperopt', 'num_configs=5, num_iterations=6, algorithm=rand', NULL, NULL, FALSE, NULL, 1, 'test1', 'test1 descr', TRUE);
+SELECT assert(
+ source_table = 'iris_data_packed' AND
+ validation_table IS NULL AND
+ model = 'automl_output' AND
+ model_info = 'automl_output_info' AND
+ dependent_varname = 'class_text' AND
+ independent_varname = 'attributes' AND
+ model_arch_table = 'iris_model_arch' AND
+ model_selection_table = 'automl_mst_table' AND
+ automl_method = 'hyperopt' AND
+ automl_params = 'num_configs=5, num_iterations=6, algorithm=rand' AND
+ random_state IS NULL AND
+ object_table IS NULL AND
+ use_gpus = FALSE AND
+ metrics_compute_frequency = 1 AND
+ name = 'test1' AND
+ description = 'test1 descr' AND
+ start_training_time < now() AND
+ end_training_time < now() AND
+ madlib_version IS NOT NULL AND
+ num_classes = 3 AND
+ class_values = '{Iris-setosa,Iris-versicolor,Iris-virginica}' AND
+ dependent_vartype = 'character varying' AND
+ normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
+) FROM (SELECT * FROM automl_output_summary) summary;
+
-- Validate output info table for metrics_iters NOT NULL
SELECT assert(
metrics_iters = ARRAY[1,2,3,4,5,6], 'Invalid metrics_iters value in output info table. Actual:' || __to_char(info)
@@ -331,6 +364,40 @@ SELECT assert(
normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
) FROM (SELECT * FROM automl_output_summary) summary;
+-- caching test case
+DROP TABLE IF EXISTS automl_output, automl_output_info, automl_output_summary, automl_mst_table,
+ automl_mst_table_summary;
+SELECT madlib_keras_automl('iris_data_packed', 'automl_output', 'iris_model_arch', 'automl_mst_table',
+ ARRAY[1,2], $${'loss': ['categorical_crossentropy'], 'optimizer_params_list': [ {'optimizer': ['Adagrad', 'Adam'],
+ 'lr': [0.9, 0.95, 'log'], 'epsilon': [0.3, 0.5, 'log_near_one']}, {'optimizer': ['Adam', 'SGD'], 'lr': [0.6, 0.65, 'log']} ],
+ 'metrics':['accuracy'] }$$, $${'batch_size': [2, 4], 'epochs': [3]}$$, 'hyperband', 'R=5, eta=5, skip_last=1',
+ NULL, NULL, FALSE, NULL, NULL, NULL, NULL, TRUE);
+SELECT assert(
+ source_table = 'iris_data_packed' AND
+ validation_table IS NULL AND
+ model = 'automl_output' AND
+ model_info = 'automl_output_info' AND
+ dependent_varname = 'class_text' AND
+ independent_varname = 'attributes' AND
+ model_arch_table = 'iris_model_arch' AND
+ model_selection_table = 'automl_mst_table' AND
+ automl_method = 'hyperband' AND
+ automl_params = 'R=5, eta=5, skip_last=1' AND
+ random_state IS NULL AND
+ object_table IS NULL AND
+ use_gpus = FALSE AND
+ metrics_compute_frequency = 1 AND
+ name IS NULL AND
+ description IS NULL AND
+ start_training_time < now() AND
+ end_training_time < now() AND
+ madlib_version IS NOT NULL AND
+ num_classes = 3 AND
+ class_values = '{Iris-setosa,Iris-versicolor,Iris-virginica}' AND
+ dependent_vartype = 'character varying' AND
+ normalizing_const = 1, 'Output summary table validation failed. Actual:' || __to_char(summary)
+) FROM (SELECT * FROM automl_output_summary) summary;
+
-- Validate metrics_elapsed_time
-- We know that the schedule for the above automl query looks like
-- s | i | n_i | r_i