Posted to commits@madlib.apache.org by kh...@apache.org on 2020/10/27 20:18:09 UTC

[madlib] 04/08: DL: [AutoML] Add new class for Distribution rules

This is an automated email from the ASF dual-hosted git repository.

khannaekta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 2d6e599bf9ab393c0e8c6b6a81b781a1a4e1088c
Author: Ekta Khanna <ek...@vmware.com>
AuthorDate: Fri Oct 9 17:32:26 2020 -0700

    DL: [AutoML] Add new class for Distribution rules
    
    JIRA: MADLIB-1453
    
    Co-authored-by: Nikhil Kak <nk...@vmware.com>
---
 .../deep_learning/input_data_preprocessor.py_in    | 17 ++++++++-------
 .../deep_learning/madlib_keras_automl.py_in        |  6 +++---
 .../deep_learning/madlib_keras_helper.py_in        |  3 +--
 .../deep_learning/madlib_keras_validator.py_in     |  1 +
 .../test/unit_tests/test_madlib_keras.py_in        | 24 ++++++++++++++--------
 .../test/unit_tests/test_madlib_keras_automl.py_in |  8 ++++----
 6 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index 1d395a6..4b27642 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -51,6 +51,9 @@ from madlib_keras_helper import *
 import time
 
 NUM_CLASSES_COLNAME = "num_classes"
+class DistributionRulesOptions:
+    ALL_SEGMENTS = 'all_segments'
+    GPU_SEGMENTS = 'gpu_segments'
 
 class InputDataPreprocessorDL(object):
     def __init__(self, schema_madlib, source_table, output_table,
@@ -64,12 +67,12 @@ class InputDataPreprocessorDL(object):
         self.buffer_size = buffer_size
         self.normalizing_const = normalizing_const
         self.num_classes = num_classes
-        self.distribution_rules = distribution_rules if distribution_rules else 'all_segments'
+        self.distribution_rules = distribution_rules.lower() if distribution_rules else DistributionRulesOptions.ALL_SEGMENTS
         self.module_name = module_name
         self.output_summary_table = None
         self.dependent_vartype = None
         self.independent_vartype = None
-        self.gpu_config = '$__madlib__$all_segments$__madlib__$'
+        self.gpu_config = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
         if self.output_table:
             self.output_summary_table = add_postfix(self.output_table, "_summary")
 
@@ -269,7 +272,7 @@ class InputDataPreprocessorDL(object):
 
         if is_platform_pg():
             # used later for writing summary table
-            self.distribution_rules = '$__madlib__$all_segments$__madlib__$'
+            self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
 
             #
             # For postgres, we just need 3 simple queries:
@@ -320,14 +323,14 @@ class InputDataPreprocessorDL(object):
         #   it's to be spread evenly across all segments, we still
         #   need to do some extra work to ensure that happens.
 
-        if self.distribution_rules == 'all_segments':
+        if self.distribution_rules == DistributionRulesOptions.ALL_SEGMENTS:
             all_segments = True
-            self.distribution_rules = '$__madlib__$all_segments$__madlib__$'
+            self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
             num_segments = get_seg_number()
         else:
             all_segments = False
 
-        if self.distribution_rules == 'gpu_segments':
+        if self.distribution_rules == DistributionRulesOptions.GPU_SEGMENTS:
             #TODO can we reuse the function `get_accessible_gpus_for_seg` from
             # madlib_keras_helper
             gpu_info_table = unique_string(desp='gpu_info')
@@ -620,7 +623,7 @@ class InputDataPreprocessorDL(object):
                        normalizing_const_colname=NORMALIZING_CONST_COLNAME,
                        num_classes_colname=NUM_CLASSES_COLNAME,
                        internal_gpu_config=INTERNAL_GPU_CONFIG,
-                       distribution_rules=DISTRIBUTION_RULES,
+                       distribution_rules=DISTRIBUTION_RULES_COLNAME,
                        FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE)
         plpy.execute(query)
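
For reference, the new DistributionRulesOptions class in this file centralizes the two named distribution_rules values that were previously hard-coded string literals, and __init__ now lower-cases user input before comparing. A standalone sketch of that pattern (only DistributionRulesOptions comes from the patch; the helper name and asserts are illustrative):

    # Standalone illustration of the pattern introduced in this file; not part of the patch.
    class DistributionRulesOptions:
        ALL_SEGMENTS = 'all_segments'
        GPU_SEGMENTS = 'gpu_segments'

    def normalize_distribution_rules(distribution_rules):
        # Mirror the __init__ handling above: default to all_segments, lower-case user input.
        if not distribution_rules:
            return DistributionRulesOptions.ALL_SEGMENTS
        return distribution_rules.lower()

    assert normalize_distribution_rules(None) == DistributionRulesOptions.ALL_SEGMENTS
    assert normalize_distribution_rules('GPU_SEGMENTS') == DistributionRulesOptions.GPU_SEGMENTS
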
 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
index 0df6772..dc8c837 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
@@ -33,7 +33,7 @@ from utilities.utilities import get_current_timestamp, get_seg_number, get_segme
 from utilities.control import SetGUC
 from madlib_keras_fit_multiple_model import FitMultipleModel
 from madlib_keras_helper import generate_row_string
-from madlib_keras_helper import DISTRIBUTION_RULES
+from madlib_keras_helper import DISTRIBUTION_RULES_COLNAME
 from madlib_keras_model_selection import MstSearch, ModelSelectionSchema
 from keras_model_arch_table import ModelArchSchema
 from utilities.validate_args import table_exists, drop_tables, input_tbl_valid
@@ -706,7 +706,7 @@ class AutoMLHyperopt(KerasAutoML):
         :return:
         """
         source_summary_table = add_postfix(self.source_table, '_summary')
-        dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES, source_summary_table))[0][DISTRIBUTION_RULES]
+        dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES_COLNAME, source_summary_table))[0][DISTRIBUTION_RULES_COLNAME]
         #TODO create constant for all_segments
         if dist_rules == "all_segments":
             return get_seg_number()
@@ -734,9 +734,9 @@ class AutoMLHyperopt(KerasAutoML):
                     self.algorithm = rand
                 elif automl_params_dict[AutoMLConstants.ALGORITHM].lower() == 'tpe':
                     self.algorithm = tpe
+                # TODO: Add support for atpe uncomment the below lines after atpe works
                 # elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() == 'atpe':
                 #     self.algorithm = atpe
-                # uncomment the above lines after atpe works # TODO
                 else:
                     plpy.error("{0}: valid algorithm 'automl_params' for hyperopt: 'rand', 'tpe'".format(self.module_name)) # , or 'atpe'
             else:
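
The dist_rules lookup above still compares against the raw string "all_segments", which the in-line TODO flags. A minimal sketch of the follow-up that TODO suggests, using the shared constant; the helper name and the non-all_segments fallback are hypothetical and not part of this commit:

    # Hypothetical follow-up for the in-line TODO above; illustrative only, not in this commit.
    class DistributionRulesOptions:          # as defined in input_data_preprocessor.py_in
        ALL_SEGMENTS = 'all_segments'
        GPU_SEGMENTS = 'gpu_segments'

    def resolve_num_segments(dist_rules, total_segments, gpu_segment_count):
        # Compare against the shared constant instead of the raw 'all_segments' literal.
        if dist_rules == DistributionRulesOptions.ALL_SEGMENTS:
            return total_segments
        # Branch for tables preprocessed onto GPU segments only (hypothetical fallback).
        return gpu_segment_count

    assert resolve_num_segments(DistributionRulesOptions.ALL_SEGMENTS, 8, 2) == 8
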
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
index 96c2817..be9a1f9 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
@@ -26,7 +26,6 @@ from utilities.validate_args import table_exists
 from madlib_keras_gpu_info import GPUInfoFunctions
 import plpy
 from math import isnan
-# from madlib_keras_model_selection import ModelSelectionSchema
 
 ############### Constants used in other deep learning files #########
 # Name of columns in model summary table.
@@ -54,7 +53,7 @@ SMALLINT_SQL_TYPE = 'SMALLINT'
 DEFAULT_NORMALIZING_CONST = 1.0
 GP_SEGMENT_ID_COLNAME = "gp_segment_id"
 INTERNAL_GPU_CONFIG = '__internal_gpu_config__'
-DISTRIBUTION_RULES = "distribution_rules"
+DISTRIBUTION_RULES_COLNAME = "distribution_rules"
 #####################################################################
 
 # Prepend a dimension to np arrays using expand_dims.
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index 8b2157d..41e4c72 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -18,6 +18,7 @@
 # under the License.
 
 import plpy
+from input_data_preprocessor import DistributionRulesOptions
 from keras_model_arch_table import ModelArchSchema
 from model_arch_info import get_num_classes
 from madlib_keras_custom_function import CustomFunctionSchema
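
This hunk only adds the import; how the validator uses it is not shown in this excerpt. A purely hypothetical membership check of the kind the shared constants enable (the function name is made up for illustration):

    # Hypothetical helper, purely for illustration; the validator's actual call sites
    # are not part of this hunk.
    class DistributionRulesOptions:          # as defined in input_data_preprocessor.py_in
        ALL_SEGMENTS = 'all_segments'
        GPU_SEGMENTS = 'gpu_segments'

    def is_named_distribution_rule(value):
        # True only for the two named shortcuts; other accepted inputs (e.g. a
        # segments-to-use table name) are outside the scope of this sketch.
        return bool(value) and value.lower() in (DistributionRulesOptions.ALL_SEGMENTS,
                                                 DistributionRulesOptions.GPU_SEGMENTS)

    assert is_named_distribution_rule('ALL_SEGMENTS')
    assert not is_named_distribution_rule('some_table')
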
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index e69bab4..13bbfd1 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -51,7 +51,8 @@ class MadlibKerasFitTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -691,7 +692,8 @@ class InternalKerasPredictTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -795,7 +797,8 @@ class MadlibKerasPredictBYOMTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -877,7 +880,8 @@ class MadlibKerasWrapperTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -1210,7 +1214,8 @@ class MadlibKerasFitCommonValidatorTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -1262,7 +1267,8 @@ class InputValidatorTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -1382,7 +1388,8 @@ class MadlibSerializerTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -1585,7 +1592,8 @@ class MadlibKerasEvaluationTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
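
Each setUp above now also stubs the utilities.mean_std_dev_calculator module in its patches dict; the rest of the setUp boilerplate is outside these hunks. A self-contained sketch of the pattern, assuming the dict is applied over sys.modules via mock.patch.dict (consistent with the self.module_patcher.stop() calls in these tests' tearDown methods); the extra entry presumably lets modules under test import mean_std_dev_calculator without a live database:

    # Standalone sketch of the module-patching pattern used by these setUp methods.
    import sys
    import unittest
    # unittest.mock here; the originals may import from the standalone mock package.
    from unittest.mock import Mock, MagicMock, patch

    class ExampleTestCase(unittest.TestCase):
        def setUp(self):
            self.plpy_mock = Mock(spec='error')
            patches = {
                'plpy': self.plpy_mock,
                # Stub module so importing mean_std_dev_calculator needs no database backend.
                'utilities.mean_std_dev_calculator': Mock()
            }
            self.plpy_mock_execute = MagicMock()
            self.plpy_mock.execute = self.plpy_mock_execute
            self.module_patcher = patch.dict(sys.modules, patches)
            self.module_patcher.start()

        def tearDown(self):
            self.module_patcher.stop()
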
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
index edb12c4..946dde3 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
@@ -37,7 +37,8 @@ class HyperbandScheduleTestCase(unittest.TestCase):
         # tested here. They are tested in dev-check.
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()
@@ -206,15 +207,14 @@ class HyperbandScheduleTestCase(unittest.TestCase):
     def tearDown(self):
         self.module_patcher.stop()
 
-
-
 class AutoMLHyperoptTestCase(unittest.TestCase):
     def setUp(self):
         # The side effects of this class(writing to the output table) are not
         # tested here. They are tested in dev-check.
         self.plpy_mock = Mock(spec='error')
         patches = {
-            'plpy': plpy
+            'plpy': plpy,
+            'utilities.mean_std_dev_calculator': Mock()
         }
 
         self.plpy_mock_execute = MagicMock()