You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2019/04/01 17:36:52 UTC

[madlib] branch master updated: DL: Remove eval statement

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new 137ba49  DL: Remove eval statement
137ba49 is described below

commit 137ba49faf62db1ee83edbe6122e2ce3428b78eb
Author: Orhan Kislal <ok...@pivotal.io>
AuthorDate: Mon Apr 1 10:36:08 2019 -0700

    DL: Remove eval statement
    
    JIRA: MADLIB-1309
    
    The eval statement used for parsing compile_params creates a security
    risk. This commit replaces it by parsing the optimizer name and its
    parameters.
    
    Closes #359
    
    Co-authored-by: Domino Valdano <dv...@pivotal.io>
---
 .../modules/deep_learning/madlib_keras.py_in       | 25 ++++--
 .../deep_learning/madlib_keras_wrapper.py_in       | 74 ++++++++++++++++--
 .../modules/deep_learning/test/madlib_keras.sql_in | 91 +++++++++++-----------
 .../test/unit_tests/test_madlib_keras.py_in        |  4 +-
 4 files changed, 135 insertions(+), 59 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 437211d..32cd921 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -43,7 +43,6 @@ from utilities.utilities import add_postfix
 from utilities.utilities import is_var_valid
 from utilities.utilities import madlib_version
 
-
 def _validate_input_table(source_table, independent_varname,
                           dependent_varname):
     _assert(is_var_valid(source_table, independent_varname),
@@ -177,6 +176,8 @@ def fit(schema_madlib, source_table, model, dependent_varname,
             validation_table, dependent_varname, independent_varname,
             input_shape, num_classes)
 
+    optimizers = get_optimizers()
+
     # Compute total buffers on each segment
     total_buffers_per_seg = plpy.execute(
         """ SELECT gp_segment_id, count(*) AS total_buffers_per_seg
@@ -241,8 +242,10 @@ def fit(schema_madlib, source_table, model, dependent_varname,
         if validation_set_provided:
             _, _, _, updated_weights = KerasWeightsSerializer.deserialize_weights(model_state, model_shapes)
             master_model.set_weights(updated_weights)
-            compile_params_args = convert_string_of_args_to_dict(compile_params)
-            master_model.compile(**compile_params_args)
+            (opt_name,final_args,compile_dict) = parse_compile_params(compile_params)
+            master_model.compile(optimizer=optimizers[opt_name](**final_args),
+                                 loss=compile_dict['loss'],
+                                 metrics=compile_dict['metrics'])
             evaluate_result = master_model.evaluate(x_validation, y_validation)
             if len(evaluate_result) < 2:
                 plpy.error('Calling evaluate on validation data returned < 2 '
@@ -396,7 +399,7 @@ def fit_transition(state, ind_var, dep_var, current_seg_id, num_classes,
     # Fit segment model on data
     start_fit = time.time()
     with K.tf.device(device_name):
-        fit_params = convert_string_of_args_to_dict(fit_params)
+        fit_params = parse_fit_params(fit_params)
         history = segment_model.fit(x_train, y_train, **fit_params)
         loss = history.history['loss'][0]
         accuracy = history.history['acc'][0]
@@ -502,9 +505,12 @@ def evaluate(schema_madlib, model_table, source_table, id_col,
     _, updated_weights = KerasWeightsSerializer.deserialize_weights(
         model_data, model_shapes)
     model.set_weights(updated_weights)
-    compile_params_args = convert_string_of_args_to_dict(compile_params)
+    optimizers = get_optimizers()
+    (opt_name,final_args,compile_dict) = parse_compile_params(compile_params)
     with K.tf.device(device_name):
-        model.compile(**compile_params_args)
+        model.compile(optimizer=optimizers[opt_name](**final_args),
+                      loss=compile_dict['loss'],
+                      metrics=compile_dict['metrics'])
 
     input_shape = map(int, input_shape)
     x_validation,  y_validation = get_data_as_np_array(source_table,
@@ -557,7 +563,6 @@ def evaluate1(schema_madlib, model_table, test_table, id_col, model_arch_table,
 
 def internal_keras_evaluate(x_test, y_test, model_arch, model_data, input_shape,
                            compile_params):
-    compile_params = convert_string_of_args_to_dict(compile_params)
     device_name = '/cpu:0'
     os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
 
@@ -570,8 +575,12 @@ def internal_keras_evaluate(x_test, y_test, model_arch, model_data, input_shape,
     _, model_weights = KerasWeightsSerializer.deserialize_weights(
         model_data, model_shapes)
     model.set_weights(model_weights)
+    optimizers = get_optimizers()
+    (opt_name,final_args,compile_dict) = parse_compile_params(compile_params)
     with K.tf.device(device_name):
-        model.compile(**compile_params)
+        model.compile(optimizer=optimizers[opt_name](**final_args),
+                      loss=compile_dict['loss'],
+                      metrics=compile_dict['metrics'])
 
     x_test = np.array(x_test).reshape(len(x_test), input_shape[0], input_shape[1],
                                       input_shape[2])
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
index 63d7e86..6f4706b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
@@ -21,11 +21,16 @@ import numpy as np
 import os
 import plpy
 
+import ast
+
 from keras import backend as K
 from keras import utils as keras_utils
 from keras.optimizers import *
 
+import keras.optimizers as opt
+
 from madlib_keras_helper import KerasWeightsSerializer
+from utilities.utilities import _assert
 
 #######################################################################
 ########### Keras specific functions #####
@@ -56,16 +61,21 @@ def clear_keras_session():
 def compile_and_set_weights(segment_model, compile_params, device_name,
                             previous_state, model_shapes):
     with K.tf.device(device_name):
-        compile_params = convert_string_of_args_to_dict(compile_params)
-        segment_model.compile(**compile_params)
+
+        optimizers = get_optimizers()
+        (opt_name,final_args,compile_dict) = parse_compile_params(compile_params)
+
+        segment_model.compile(optimizer=optimizers[opt_name](**final_args),
+                              loss=compile_dict['loss'],
+                              metrics=compile_dict['metrics'])
         _, _, _, model_weights = KerasWeightsSerializer.deserialize_weights(
             previous_state, model_shapes)
         segment_model.set_weights(model_weights)
 
-
 """
 Used to convert compile_params and fit_params to actual argument dictionaries
 """
+
 def convert_string_of_args_to_dict(str_of_args):
     """Uses parenthases matching algorithm to intelligently convert
     a string with valid python code into an argument dictionary"""
@@ -76,6 +86,9 @@ def convert_string_of_args_to_dict(str_of_args):
         '{' : '}',
     }
     result_str = ""
+    key_str = ""
+    value_str = ""
+    compile_dict = {}
     for char in str_of_args:
         if char in dual.keys():
             stack.append(char)
@@ -85,7 +98,58 @@ def convert_string_of_args_to_dict(str_of_args):
                 stack.pop(-1)
             result_str += char
         elif not stack and char == "=":
-            result_str += ":"
+            key_str = result_str
+            result_str = ""
+        elif not stack and char == ",":
+            value_str = result_str
+            result_str = ""
+            compile_dict[key_str.strip()]=value_str.strip('\'')
         else:
             result_str += char
-    return eval('{' + result_str + '}')
+    value_str = result_str
+    result_str = ""
+    compile_dict[key_str.strip()]=value_str.strip('\'')
+    return compile_dict
+
+# Parse the compile parameters and the optimizer.
+# Optimizer name and its arguments are returned in addition to the rest of the
+# compile parameters.
+def parse_compile_params(str_of_args):
+
+    compile_dict = convert_string_of_args_to_dict(str_of_args)
+    compile_dict['metrics'] = ast.literal_eval(compile_dict['metrics']) if 'metrics' in compile_dict.keys() else None
+    compile_dict['loss_weights'] = ast.literal_eval(compile_dict['loss_weights']) if 'loss_weights' in compile_dict.keys() else None
+
+    opt_name = compile_dict['optimizer'].split('(')[0]
+    optimizers = get_optimizers()
+    _assert(opt_name in optimizers,
+            "model_keras error: invalid optimizer name: {0}".format(opt_name))
+    opt_params = compile_dict['optimizer'].split('(')[1][:-1]
+    opt_params_array = opt_params.split(',')
+    opt_params_clean = map(split_and_strip, opt_params_array)
+    key_value_params = { x[0] : x[1] for x in opt_params_clean}
+    final_args = { key: bool(value) if value == 'True' or value == 'False' else  float(value) for key,value in key_value_params.iteritems() }
+
+    return (opt_name,final_args,compile_dict)
+
+# Parse the fit parameters into a dictionary.
+def parse_fit_params(str_of_args):
+    compile_dict = convert_string_of_args_to_dict(str_of_args)
+    for key in compile_dict.keys():
+        compile_dict[key] = ast.literal_eval(compile_dict[key])
+    return compile_dict
+
+# Split and strip the whitespace of key=value formatted strings
+def split_and_strip(x):
+    y = x.split('=')
+    return (y[0].strip(),y[1].strip())
+
+# Return a dict mapping keras optimizer names to their classes
+def get_optimizers():
+    optimizers = dict()
+    names = dir(opt)
+    for n in names:
+        optimizer = eval('opt.' + n)
+        if optimizer.__class__ == type and optimizer.__base__ == opt.Optimizer:
+            optimizers[n] = optimizer
+    return optimizers
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index f69bca5..fbf6a81 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -58,6 +58,9 @@ SELECT load_keras_model('model_arch',
 	}], "backend": "tensorflow"}$$);
 ALTER TABLE model_arch RENAME model_id TO id;
 
+-- Please do not break the compile_params string across lines:
+-- the summary assertion below compares it verbatim and would fail
+
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit('cifar_10_sample_batched',
               'keras_out',
@@ -65,8 +68,8 @@ SELECT madlib_keras_fit('cifar_10_sample_batched',
               'independent_var',
               'model_arch',
               1,
-              '''optimizer''=SGD(lr=0.01, decay=1e-6, nesterov=True), ''loss''=''categorical_crossentropy'', ''metrics''=[''accuracy'']'::text,
-              '''batch_size''=2, ''epochs''=1, ''verbose''=0'::text,
+              $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text,
+              $$ batch_size=2, epochs=1, verbose=0 $$::text,
               3,
               10,
               FALSE,
@@ -86,8 +89,8 @@ SELECT assert(
         description is NULL AND
         model_size > 0 AND
         madlib_version is NOT NULL AND
-        compile_params = '''optimizer''=SGD(lr=0.01, decay=1e-6, nesterov=True), ''loss''=''categorical_crossentropy'', ''metrics''=[''accuracy'']' AND
-        fit_params = '''batch_size''=2, ''epochs''=1, ''verbose''=0' AND
+        compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
+        fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
         num_iterations = 3 AND
         num_classes = 10 AND
         accuracy is not NULL AND
@@ -110,47 +113,47 @@ SELECT assert(model_data is not NULL , 'Keras model output validation failed') f
 	-- Null validation table
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
 SELECT madlib_keras_fit('cifar_10_sample_batched',
-												'keras_out',
-												'dependent_var',
-												'independent_var',
-												'model_arch',
-												1,
-												'''optimizer''=SGD(lr=0.01, decay=1e-6, nesterov=True), ''loss''=''categorical_crossentropy'', ''metrics''=[''accuracy'']'::text,
-												'''batch_size''=2, ''epochs''=1, ''verbose''=0'::text,
-												1,
-												10,
-												FALSE,
-												NULL,
-  											'model name', 'model desc');
+						'keras_out',
+						'dependent_var',
+						'independent_var',
+						'model_arch',
+						1,
+                        $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text,
+                        $$ batch_size=2, epochs=1, verbose=0 $$::text,
+						1,
+						10,
+						FALSE,
+						NULL,
+						'model name', 'model desc');
 SELECT assert(
-							 model_arch_table = 'model_arch' AND
-							 model_arch_id = 1 AND
-							 model_type = 'madlib_keras' AND
-							 start_training_time         < now() AND
-							 end_training_time > start_training_time AND
-							 source_table = 'cifar_10_sample_batched' AND
-							 validation_table = 'cifar_10_sample_batched' AND
-							 model = 'keras_out' AND
-							 dependent_varname = 'dependent_var' AND
-							 independent_varname = 'independent_var' AND
-							 name = 'model name' AND
-							 description = 'model desc' AND
-							 model_size > 0 AND
-							 madlib_version is NOT NULL AND
-							 compile_params = '''optimizer''=SGD(lr=0.01, decay=1e-6, nesterov=True), ''loss''=''categorical_crossentropy'', ''metrics''=[''accuracy'']' AND
-							 fit_params = '''batch_size''=2, ''epochs''=1, ''verbose''=0' AND
-							 num_iterations = 1 AND
-							 num_classes = 10 AND
-							 accuracy is not NULL AND
-							 loss is not NULL AND
-							 array_upper(accuracy_iter, 1) = 1 AND
-							 array_upper(loss_iter, 1) = 1 AND
-							 array_upper(time_iter, 1) = 1 AND
-							 accuracy_validation is  NULL AND
-							 loss_validation is  NULL AND
-							 array_upper(accuracy_iter_validation,1) = 0 AND
-							 array_upper(loss_iter_validation,1) = 0 ,
-							 'Keras model output Summary Validation failed. Actual:' || __to_char(summary))
+        model_arch_table = 'model_arch' AND
+        model_arch_id = 1 AND
+        model_type = 'madlib_keras' AND
+        start_training_time         < now() AND
+        end_training_time > start_training_time AND
+        source_table = 'cifar_10_sample_batched' AND
+        validation_table = 'cifar_10_sample_batched' AND
+        model = 'keras_out' AND
+        dependent_varname = 'dependent_var' AND
+        independent_varname = 'independent_var' AND
+        name = 'model name' AND
+        description = 'model desc' AND
+        model_size > 0 AND
+        madlib_version is NOT NULL AND
+        compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
+        fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
+        num_iterations = 1 AND
+        num_classes = 10 AND
+        accuracy is not NULL AND
+        loss is not NULL AND
+        array_upper(accuracy_iter, 1) = 1 AND
+        array_upper(loss_iter, 1) = 1 AND
+        array_upper(time_iter, 1) = 1 AND
+        accuracy_validation is  NULL AND
+        loss_validation is  NULL AND
+        array_upper(accuracy_iter_validation,1) = 0 AND
+        array_upper(loss_iter_validation,1) = 0 ,
+        'Keras model output Summary Validation failed. Actual:' || __to_char(summary))
 from (select * from keras_out_summary) summary;
 
 SELECT assert(model_data is not NULL , 'Keras model output validation failed') from (select * from keras_out) k;
diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index fe6a1b8..d9613ca 100644
--- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -55,8 +55,8 @@ class MadlibKerasFitTestCase(unittest.TestCase):
                          input_shape=(1,1,1,), padding='same'))
         self.model.add(Flatten())
 
-        self.compile_params = "'optimizer'=SGD(lr=0.01, decay=1e-6, nesterov=True), 'loss'='categorical_crossentropy', 'metrics'=['accuracy']"
-        self.fit_params = "'batch_size'=1, 'epochs'=1"
+        self.compile_params = "optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy']"
+        self.fit_params = "batch_size=1, epochs=1"
         self.model_weights = [3,4,5,6]
         self.model_shapes = []
         for a in self.model.get_weights():