You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nk...@apache.org on 2020/03/24 20:06:36 UTC

[madlib] 05/05: DL: Use unlogged tables for fit multiple

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 01ede7c82231acb12e6c4b4234fdbb3ab6633b72
Author: Nikhil Kak <nk...@pivotal.io>
AuthorDate: Tue Mar 10 14:48:42 2020 -0700

    DL: Use unlogged tables for fit multiple
    
    GPDB6 supports UNLOGGED tables. From gpdb documentation
    https://gpdb.docs.pivotal.io/6-0/ref_guide/sql_commands/CREATE_TABLE_AS.html
    
    ```
    Data written to unlogged tables is not written to the write-ahead (WAL)
    log, which makes them considerably faster than ordinary tables
    ```
    
    We saw speed improvements because of using unlogged tables on our gcp
    cluster with GPDB6. But, we do not want the final model output table,
    summary table and the info table to be unlogged. This commit ensures for
    all intermediate iterations, except the very final hop in the last
    iteration, the model output table is created as `UNLOGGED` for
    performance. Only for the last hop in the last iteration, it is created
    as a persistent table.
    
    This commit also adds a test to validate model output, summary and info
    tables are persistent(not unlogged) and adds a new utility function to
    check if the gpdb version is 6 and up.
    
    Co-authored-by: Ekta Khanna <ek...@pivotal.io>
---
 .../deep_learning/madlib_keras_fit_multiple_model.py_in | 15 ++++++++++-----
 .../test/madlib_keras_model_selection.sql_in            |  5 +++++
 .../test/madlib_keras_transfer_learning.sql_in          |  6 ++++++
 src/ports/postgres/modules/utilities/utilities.py_in    |  5 +++++
 src/ports/postgres/modules/utilities/utilities.sql_in   | 17 +++++++++++++++++
 5 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
index c6352b5..9de9774 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.py_in
@@ -35,8 +35,9 @@ from madlib_keras_wrapper import *
 from utilities.control import MinWarning
 from utilities.control import OptimizerControl
 from utilities.control import SetGUC
-from utilities.utilities import unique_string
 from utilities.utilities import add_postfix
+from utilities.utilities import is_platform_gp6_or_up
+from utilities.utilities import unique_string
 from utilities.utilities import rotate
 from utilities.utilities import madlib_version
 from utilities.utilities import is_platform_pg
@@ -181,6 +182,7 @@ class FitMultipleModel():
         random.shuffle(self.msts_for_schedule)
         self.grand_schedule = self.generate_schedule(self.msts_for_schedule)
         self.gp_segment_id_col = '0' if is_platform_pg() else GP_SEGMENT_ID_COLNAME
+        self.unlogged_table = "UNLOGGED" if is_platform_gp6_or_up() else ''
 
         if self.warm_start:
             self.create_model_output_table_warm_start()
@@ -224,6 +226,7 @@ class FitMultipleModel():
                 mst_row = [self.grand_schedule[dist_key][mst_idx]
                            for dist_key in self.dist_keys]
                 self.create_mst_schedule_table(mst_row)
+                self.is_final_training_call = (iter == self.num_iterations and mst_idx == total_msts-1)
                 if mst_idx == 0:
                     start_iteration = time.time()
                 self.run_training(mst_idx)
@@ -286,7 +289,7 @@ class FitMultipleModel():
 
     def create_mst_schedule_table(self, mst_row):
         mst_temp_query = """
-                         CREATE TABLE {self.mst_current_schedule_tbl}
+                         CREATE {self.unlogged_table} TABLE {self.mst_current_schedule_tbl}
                                 ({self.model_id_col} INTEGER,
                                  {self.compile_params_col} VARCHAR,
                                  {self.fit_params_col} VARCHAR,
@@ -556,7 +559,7 @@ class FitMultipleModel():
         # that was used for initializing GPU memory.
         use_gpus = self.use_gpus if self.use_gpus else False
         mst_weights_query = """
-            CREATE TABLE {self.mst_weights_tbl} AS
+            CREATE {self.unlogged_table} TABLE {self.mst_weights_tbl} AS
                 SELECT mst_tbl.*, wgh_tbl.{self.model_weights_col},
                        model_arch_tbl.{self.model_arch_col}
                 FROM
@@ -571,7 +574,7 @@ class FitMultipleModel():
         plpy.execute(mst_weights_query)
         use_gpus = self.use_gpus if self.use_gpus else False
         uda_query = """
-            CREATE TABLE {self.weights_to_update_tbl} AS
+            CREATE {self.unlogged_table} TABLE {self.weights_to_update_tbl} AS
             SELECT {self.schema_madlib}.fit_step_multiple_model({mb_dep_var_col},
                 {mb_indep_var_col},
                 {self.dep_shape_col},
@@ -648,9 +651,11 @@ class FitMultipleModel():
 
         with SetGUC("dev_opt_unsafe_truncate_in_subtransaction", "on"):
             temp_model_table = unique_string('updated_model')
+            unlogged_table = self.unlogged_table if not self.is_final_training_call else ''
             plpy.execute("""
-            CREATE TABLE {temp_model_table} ( LIKE {self.model_output_table}
+            CREATE {unlogged_table} TABLE {temp_model_table} ( LIKE {self.model_output_table}
             INCLUDING indexes);""".format(temp_model_table=temp_model_table,
+                                          unlogged_table=unlogged_table,
                                           self=self))
             plpy.execute("""
             INSERT INTO {temp_model_table} SELECT * FROM {self.model_output_table};
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_model_selection.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_model_selection.sql_in
index 391beb7..e1dbe0c 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_model_selection.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_model_selection.sql_in
@@ -210,6 +210,11 @@ SELECT assert(
         'Keras Fit Multiple Output Summary Validation failed when user passes in 1-hot encoded label vector. Actual:' || __to_char(summary))
 FROM (SELECT * FROM iris_multiple_model_summary) summary;
 
+-- Test the output table created are all persistent(not unlogged)
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model') = false, 'Model output table is unlogged');
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model_summary') = false, 'Model summary output table is unlogged');
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model_info') = false, 'Model info output table is unlogged');
+
 -- Test when number of configs(3) equals number of segments(3)
 DROP TABLE IF EXISTS iris_multiple_model, iris_multiple_model_summary, iris_multiple_model_info;
 SELECT setseed(0);
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras_transfer_learning.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras_transfer_learning.sql_in
index 4334fa6..43ad38b 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras_transfer_learning.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras_transfer_learning.sql_in
@@ -173,6 +173,12 @@ SELECT madlib_keras_fit_multiple_model(
   TRUE -- warm_start
 );
 
+-- Test that when warm_start is TRUE, all the output tables are persistent(not unlogged)
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model') = false, 'Model output table is unlogged');
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model_summary') = false, 'Model summary output table is unlogged');
+SELECT assert(MADLIB_SCHEMA.is_table_unlogged('iris_multiple_model_info') = false, 'Model info output table is unlogged');
+
+
 SELECT assert(
   array_upper(training_loss, 1) = 3 AND
   array_upper(training_metrics, 1) = 3,
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index fca3a6b..577235e 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -50,6 +50,11 @@ def has_function_properties():
 def is_platform_pg():
     """ __POSTGRESQL__ variable defined during configure """
     return m4_ifdef(<!__POSTGRESQL__!>, <!True!>, <!False!>)
+
+def is_platform_gp6_or_up():
+    version_wrapper = __mad_version()
+    return not is_platform_pg() and not version_wrapper.is_gp_version_less_than('6.0')
+
 # ------------------------------------------------------------------------------
 
 
diff --git a/src/ports/postgres/modules/utilities/utilities.sql_in b/src/ports/postgres/modules/utilities/utilities.sql_in
index 9e5deee..bbf861d 100644
--- a/src/ports/postgres/modules/utilities/utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/utilities.sql_in
@@ -574,3 +574,20 @@ if actual_guc_value != expected_guc_value:
                'Actual Value: {2}'.format(guc_name, expected_guc_value, actual_guc_value))
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.is_table_unlogged(
+  tbl_name TEXT
+)
+RETURNS boolean AS $$
+DECLARE
+ret_val   boolean;
+BEGIN
+    BEGIN
+        SELECT relpersistence='u' INTO ret_val FROM pg_class WHERE relname = tbl_name;
+    EXCEPTION
+        WHEN OTHERS THEN
+            RETURN false; -- Traps exception and returns false, for GP versions < 6 as relpersistence is not a col in pg_class
+    END;
+    RETURN ret_val;
+END;
+$$ LANGUAGE plpgsql;