You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by kh...@apache.org on 2020/10/27 20:18:09 UTC
[madlib] 04/08: DL: [AutoML] Add new class for Distribution rules
This is an automated email from the ASF dual-hosted git repository.
khannaekta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 2d6e599bf9ab393c0e8c6b6a81b781a1a4e1088c
Author: Ekta Khanna <ek...@vmware.com>
AuthorDate: Fri Oct 9 17:32:26 2020 -0700
DL: [AutoML] Add new class for Distribution rules
JIRA: MADLIB-1453
Co-authored-by: Nikhil Kak <nk...@vmware.com>
---
.../deep_learning/input_data_preprocessor.py_in | 17 ++++++++-------
.../deep_learning/madlib_keras_automl.py_in | 6 +++---
.../deep_learning/madlib_keras_helper.py_in | 3 +--
.../deep_learning/madlib_keras_validator.py_in | 1 +
.../test/unit_tests/test_madlib_keras.py_in | 24 ++++++++++++++--------
.../test/unit_tests/test_madlib_keras_automl.py_in | 8 ++++----
6 files changed, 35 insertions(+), 24 deletions(-)
diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index 1d395a6..4b27642 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -51,6 +51,9 @@ from madlib_keras_helper import *
import time
NUM_CLASSES_COLNAME = "num_classes"
+class DistributionRulesOptions:
+ ALL_SEGMENTS = 'all_segments'
+ GPU_SEGMENTS = 'gpu_segments'
class InputDataPreprocessorDL(object):
def __init__(self, schema_madlib, source_table, output_table,
@@ -64,12 +67,12 @@ class InputDataPreprocessorDL(object):
self.buffer_size = buffer_size
self.normalizing_const = normalizing_const
self.num_classes = num_classes
- self.distribution_rules = distribution_rules if distribution_rules else 'all_segments'
+ self.distribution_rules = distribution_rules.lower() if distribution_rules else DistributionRulesOptions.ALL_SEGMENTS
self.module_name = module_name
self.output_summary_table = None
self.dependent_vartype = None
self.independent_vartype = None
- self.gpu_config = '$__madlib__$all_segments$__madlib__$'
+ self.gpu_config = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
if self.output_table:
self.output_summary_table = add_postfix(self.output_table, "_summary")
@@ -269,7 +272,7 @@ class InputDataPreprocessorDL(object):
if is_platform_pg():
# used later for writing summary table
- self.distribution_rules = '$__madlib__$all_segments$__madlib__$'
+ self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
#
# For postgres, we just need 3 simple queries:
@@ -320,14 +323,14 @@ class InputDataPreprocessorDL(object):
# it's to be spread evenly across all segments, we still
# need to do some extra work to ensure that happens.
- if self.distribution_rules == 'all_segments':
+ if self.distribution_rules == DistributionRulesOptions.ALL_SEGMENTS:
all_segments = True
- self.distribution_rules = '$__madlib__$all_segments$__madlib__$'
+ self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS)
num_segments = get_seg_number()
else:
all_segments = False
- if self.distribution_rules == 'gpu_segments':
+ if self.distribution_rules == DistributionRulesOptions.GPU_SEGMENTS:
#TODO can we reuse the function `get_accessible_gpus_for_seg` from
# madlib_keras_helper
gpu_info_table = unique_string(desp='gpu_info')
@@ -620,7 +623,7 @@ class InputDataPreprocessorDL(object):
normalizing_const_colname=NORMALIZING_CONST_COLNAME,
num_classes_colname=NUM_CLASSES_COLNAME,
internal_gpu_config=INTERNAL_GPU_CONFIG,
- distribution_rules=DISTRIBUTION_RULES,
+ distribution_rules=DISTRIBUTION_RULES_COLNAME,
FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE)
plpy.execute(query)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
index 0df6772..dc8c837 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
@@ -33,7 +33,7 @@ from utilities.utilities import get_current_timestamp, get_seg_number, get_segme
from utilities.control import SetGUC
from madlib_keras_fit_multiple_model import FitMultipleModel
from madlib_keras_helper import generate_row_string
-from madlib_keras_helper import DISTRIBUTION_RULES
+from madlib_keras_helper import DISTRIBUTION_RULES_COLNAME
from madlib_keras_model_selection import MstSearch, ModelSelectionSchema
from keras_model_arch_table import ModelArchSchema
from utilities.validate_args import table_exists, drop_tables, input_tbl_valid
@@ -706,7 +706,7 @@ class AutoMLHyperopt(KerasAutoML):
:return:
"""
source_summary_table = add_postfix(self.source_table, '_summary')
- dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES, source_summary_table))[0][DISTRIBUTION_RULES]
+ dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES_COLNAME, source_summary_table))[0][DISTRIBUTION_RULES_COLNAME]
#TODO create constant for all_segments
if dist_rules == "all_segments":
return get_seg_number()
@@ -734,9 +734,9 @@ class AutoMLHyperopt(KerasAutoML):
self.algorithm = rand
elif automl_params_dict[AutoMLConstants.ALGORITHM].lower() == 'tpe':
self.algorithm = tpe
+ # TODO: Add support for atpe; uncomment the lines below once atpe works
# elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() == 'atpe':
# self.algorithm = atpe
- # uncomment the above lines after atpe works # TODO
else:
plpy.error("{0}: valid algorithm 'automl_params' for hyperopt: 'rand', 'tpe'".format(self.module_name)) # , or 'atpe'
else:
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
index 96c2817..be9a1f9 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
@@ -26,7 +26,6 @@ from utilities.validate_args import table_exists
from madlib_keras_gpu_info import GPUInfoFunctions
import plpy
from math import isnan
-# from madlib_keras_model_selection import ModelSelectionSchema
############### Constants used in other deep learning files #########
# Name of columns in model summary table.
@@ -54,7 +53,7 @@ SMALLINT_SQL_TYPE = 'SMALLINT'
DEFAULT_NORMALIZING_CONST = 1.0
GP_SEGMENT_ID_COLNAME = "gp_segment_id"
INTERNAL_GPU_CONFIG = '__internal_gpu_config__'
-DISTRIBUTION_RULES = "distribution_rules"
+DISTRIBUTION_RULES_COLNAME = "distribution_rules"
#####################################################################
# Prepend a dimension to np arrays using expand_dims.
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index 8b2157d..41e4c72 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -18,6 +18,7 @@
# under the License.
import plpy
+from input_data_preprocessor import DistributionRulesOptions
from keras_model_arch_table import ModelArchSchema
from model_arch_info import get_num_classes
from madlib_keras_custom_function import CustomFunctionSchema
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index e69bab4..13bbfd1 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -51,7 +51,8 @@ class MadlibKerasFitTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -691,7 +692,8 @@ class InternalKerasPredictTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -795,7 +797,8 @@ class MadlibKerasPredictBYOMTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -877,7 +880,8 @@ class MadlibKerasWrapperTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -1210,7 +1214,8 @@ class MadlibKerasFitCommonValidatorTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -1262,7 +1267,8 @@ class InputValidatorTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -1382,7 +1388,8 @@ class MadlibSerializerTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -1585,7 +1592,8 @@ class MadlibKerasEvaluationTestCase(unittest.TestCase):
def setUp(self):
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
index edb12c4..946dde3 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in
@@ -37,7 +37,8 @@ class HyperbandScheduleTestCase(unittest.TestCase):
# tested here. They are tested in dev-check.
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()
@@ -206,15 +207,14 @@ class HyperbandScheduleTestCase(unittest.TestCase):
def tearDown(self):
self.module_patcher.stop()
-
-
class AutoMLHyperoptTestCase(unittest.TestCase):
def setUp(self):
# The side effects of this class(writing to the output table) are not
# tested here. They are tested in dev-check.
self.plpy_mock = Mock(spec='error')
patches = {
- 'plpy': plpy
+ 'plpy': plpy,
+ 'utilities.mean_std_dev_calculator': Mock()
}
self.plpy_mock_execute = MagicMock()