You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2022/11/21 11:43:20 UTC

[madlib] branch master updated: XGBoost: Various fixes

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new ee7c919d XGBoost: Various fixes
ee7c919d is described below

commit ee7c919d256322d66a3112a862d6123f713d7a45
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Thu Oct 13 15:06:45 2022 -0400

    XGBoost: Various fixes
    
    - Fix class label verification in predict
    
    Class label is an optional argument for predict but the code checks to
    ensure the column exists in the table. This commit fixes the issue and
    adds a test to run predict with default values and no optional
    parameters.
    
    - Add support for bigint and varchar id col
    
    XGBoost supports non-integer values as id columns (not features) in the
    python implementation. This commit alters the surrounding code to
    accommodate such column types and adds/alters tests accordingly.
    
    - Add eval_metrics as a parameter
    
    eval_metrics is used by XGBoost for monitoring the training result and
    early stopping. We expose this parameter to the user and parse it to
    pass it to the fit function (instead of init).
---
 .../postgres/modules/mxgboost/madlib_xgboost.py_in | 30 ++++++---
 .../modules/mxgboost/madlib_xgboost.sql_in         | 11 +++-
 .../modules/mxgboost/test/madlib_xgboost.sql_in    | 73 +++++++++++++++++++---
 3 files changed, 94 insertions(+), 20 deletions(-)

diff --git a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
index bd4d5d28..ce2a8e4f 100644
--- a/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
+++ b/src/ports/postgres/modules/mxgboost/madlib_xgboost.py_in
@@ -40,6 +40,7 @@ from utilities.utilities import _assert
 from utilities.utilities import add_postfix
 from utilities.utilities import unique_string
 from utilities.validate_args import get_cols
+from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import output_tbl_valid
 from utilities.validate_args import cols_in_tbl_valid
@@ -106,6 +107,13 @@ def expand_grid(params):
     params_grid = [l for l in itertools.product(*params_list)]
     return params_grid
 
+def try_literal_eval(t):
+    try:
+        ret = ast.literal_eval(t)
+    except Exception:
+        ret = t
+    return ret
+
 def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
                   class_weights, train_set_size, id_column, train_set_split_var):
     """
@@ -137,6 +145,7 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
         X_test = X[numpy.array(df[train_set_split_var]==0),]
         y_train = y[numpy.array(df[train_set_split_var]==1)]
         y_test = y[numpy.array(df[train_set_split_var]==0)]
+
     #save off and remove the id_column for later output. Make sure to get rid of id_column from features!
     test_ids = X_test [:,len(features)-1]
     X_train = numpy.delete(X_train,len(features)-1,1)
@@ -167,14 +176,14 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
 
     #Train gradient boosted trees
     p_list = [p.split('=') for p in ast.literal_eval(re.sub("[\\t]","",params).strip())]
-    params_dict = dict([(k, ast.literal_eval(v.strip())) for k,v in p_list])
+    params_dict = dict([(k, try_literal_eval(v.strip())) for k,v in p_list])
+    eval_metric = params_dict.pop('eval_metric') if 'eval_metric' in params_dict else 'auc'
     gbm = xgb.XGBClassifier(**params_dict)
-
     #Fit model
     gbm.fit(
         X_train,
         y_train,
-        eval_metric = 'auc',
+        eval_metric = eval_metric,
         sample_weight = sample_weights
     )
     #Compute and return model metrics score
@@ -199,7 +208,6 @@ def xgboost_train(schema_madlib, dframe, features_all, class_label, params,
     fnames, f_importance_scores = zip(*fnames_importances)
     important_features = pd.DataFrame(fnames_importances)
 
-    test_ids = [int(x) for x in test_ids]
     return (features, pickle.dumps(gbm), params, fnames, f_importance_scores,
         model_metrics.iloc[:,1].values.tolist(), model_metrics.iloc[:,2].values.tolist(),
         model_metrics.iloc[:,3].values.tolist(),model_metrics.iloc[:,4].values.tolist(),
@@ -240,7 +248,8 @@ def xgboost_grid_search(schema_madlib, source_table, id_column, class_label,
         {
             'learning_rate': [0.3], #Regularization on weights (eta). For smaller values, increase n_estimators
             'max_depth': [6],#Larger values could lead to overfitting
-            'n_estimators':[100] #More estimators, lesser variance (better fit on test set)
+            'n_estimators':[100], #More estimators, lesser variance (better fit on test set)
+            'eval_metric':['auc']
         }
         """
 
@@ -339,7 +348,7 @@ def xgboost_grid_search(schema_madlib, source_table, id_column, class_label,
                 (mdl_results).recall,
                 (mdl_results).fscore,
                 (mdl_results).support,
-                (mdl_results).test_ids::INTEGER[],
+                (mdl_results).test_ids,
                 params_index
             FROM
                 {grid_search_results_temp_tbl}
@@ -375,7 +384,9 @@ def xgboost_predict(schema_madlib, scoring_tbl, mdl_table, mdl_output_tbl,
     """
 
     input_tbl_valid(scoring_tbl, 'XGBoost')
-    cols_in_tbl_valid(scoring_tbl, [id_column, class_label], 'XGBoost')
+    cols_in_tbl_valid(scoring_tbl, [id_column], 'XGBoost')
+    if class_label:
+        cols_in_tbl_valid(scoring_tbl, [class_label], 'XGBoost')
     input_tbl_valid(mdl_table, 'XGBoost')
     output_tbl_valid(mdl_output_tbl, 'XGBoost')
     mdl_output_tbl_metrics = add_postfix(mdl_output_tbl, '_metrics')
@@ -383,6 +394,7 @@ def xgboost_predict(schema_madlib, scoring_tbl, mdl_table, mdl_output_tbl,
     output_tbl_valid(mdl_output_tbl_metrics, 'XGBoost')
     output_tbl_valid(mdl_output_tbl_roc_curve, 'XGBoost')
 
+    id_type = get_expr_type(id_column, scoring_tbl)
     #Load the serialized XGBoost model from the table
     mdl_sql = """
         SELECT
@@ -399,14 +411,12 @@ def xgboost_predict(schema_madlib, scoring_tbl, mdl_table, mdl_output_tbl,
     gbm = pickle.loads(model)
 
     #Fetch features from test dataset for scoring
-    plpy.info(features)
     if isinstance(features, list):
         features_str = ','.join(features)
     else:
         features_str = features
         features = [features]
     comma_class_label = ', {0}'.format(class_label) if class_label else ''
-    plpy.info(features_str)
     mdl_score_sql = """
         SELECT
             {id_column},
@@ -499,7 +509,7 @@ def xgboost_predict(schema_madlib, scoring_tbl, mdl_table, mdl_output_tbl,
     sql = """
         CREATE TABLE {mdl_output_tbl}
         (
-            {id_column} INTEGER,
+            {id_column} {id_type},
             {predicted_class_label} TEXT,
             {predicted_class_proba_label} FLOAT8[]
         )
diff --git a/src/ports/postgres/modules/mxgboost/madlib_xgboost.sql_in b/src/ports/postgres/modules/mxgboost/madlib_xgboost.sql_in
index 83be72dc..6f168907 100644
--- a/src/ports/postgres/modules/mxgboost/madlib_xgboost.sql_in
+++ b/src/ports/postgres/modules/mxgboost/madlib_xgboost.sql_in
@@ -96,7 +96,7 @@ SELECT xgboost(
   <DT>id_column</DT>
   <DD>TEXT. Name of the column containing id information in the training data.
   This is a mandatory argument and the values are expected to be unique for each
-  row.
+  row. Suggested column types are INTEGER, BIGINT, and VARCHAR.
   </DD>
 
   <DT>dependent_variable</DT>
@@ -124,10 +124,17 @@ SELECT xgboost(
   accepts any parameter thanks to kwargs. If there is a typo in the list, it
   might get ignored by xgboost.
 
+  eval_metric is a unique key for this dictionary. In XGBoost v0.82, this
+  parameter is passed to the fit function and not the initializer like the
+  rest. After v1.6.0, it was moved to the initializer. MADlib interface keeps
+  these parameters together to ensure that when we upgrade the supported version
+  of XGBoost, we will be able to maintain the same interface.
+
   Default values set by MADlib:
   learning_rate: 0.3
   max_depth: 6
   n_estimators: 100
+  eval_metric: 'auc'
 
   </DD>
 
@@ -630,7 +637,7 @@ AS
     recall TEXT[],
     fscore TEXT[],
     support TEXT[],
-    test_ids INTEGER[]
+    test_ids VARCHAR[]
 );
 
 DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__xgboost_train_parallel__(
diff --git a/src/ports/postgres/modules/mxgboost/test/madlib_xgboost.sql_in b/src/ports/postgres/modules/mxgboost/test/madlib_xgboost.sql_in
index 345129e1..834a1b05 100644
--- a/src/ports/postgres/modules/mxgboost/test/madlib_xgboost.sql_in
+++ b/src/ports/postgres/modules/mxgboost/test/madlib_xgboost.sql_in
@@ -22,7 +22,7 @@
 
 DROP TABLE IF EXISTS abalone;
 CREATE TABLE abalone (
-        id integer,
+        id bigint,
         sex char(1),
         length float,
         diameter float,
@@ -4212,7 +4212,7 @@ INSERT INTO abalone VALUES
 (4174,'M',0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10),
 (4175,'M',0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9),
 (4176,'F',0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10),
-(4177,'M',0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12);
+(99999999994177,'M',0.71,0.555,0.195,1.9485,0.9455,0.3765,0.495,12);
 
 SELECT xgboost(
     'abalone',  -- Training table
@@ -4268,12 +4268,44 @@ SELECT xgboost(
 );
 
 SELECT xgboost_predict(
-    'abalone',          -- test_table
-    'xgb_grid_out',          -- model_table
-    'xgb_grid_score_out',    -- predict_output_table
-    'id',               -- id_column
-    'sex',              -- class_label
-    2                   -- model_filters
+    'abalone',                  -- test_table
+    'xgb_grid_out',             -- model_table
+    'xgb_grid_score_out',       -- predict_output_table
+    'id',                       -- id_column
+    'sex',                      -- class_label
+    2                           -- model_filters
+);
+
+SELECT xgboost(
+    'abalone',  -- Training table
+    'xgb_grid_eval_out',  -- Grid search results table.
+    'id',       -- Id column
+    'sex',      -- Class label column
+    '*',        -- Independent variables
+    NULL,       -- Columns to exclude from features
+    $$
+    {
+        'learning_rate': [0.01], #Regularization on weights (eta). For smaller values, increase n_estimators
+        'max_depth': [9],#Larger values could lead to overfitting
+        'subsample': [0.85],#introduce randomness in samples picked to prevent overfitting
+        'colsample_bytree': [0.85],#introduce randomness in features picked to prevent overfitting
+        'min_child_weight': [10],#larger values will prevent over-fitting
+        'n_estimators':[100], #More estimators, lesser variance (better fit on test set)
+        'eval_metric':['auc', 'logloss']
+    }
+    $$,         -- XGBoost grid search parameters
+    '',         -- Class weights
+    0.8,        -- Training set size ratio
+    NULL        -- Variable used to do the test/train split.
+);
+
+SELECT xgboost_predict(
+    'abalone',                  -- test_table
+    'xgb_grid_eval_out',        -- model_table
+    'xgb_grid_eval_score_out',  -- predict_output_table
+    'id',                       -- id_column
+    'sex',                      -- class_label
+    2
 );
 
 SELECT xgboost(
@@ -4283,3 +4315,28 @@ SELECT xgboost(
     'sex',
     '*'
 );
+
+SELECT xgboost_predict(
+    'abalone',                  -- test_table
+    'xgb_default_out',          -- model_table
+    'xgb_default_score_out',    -- predict_output_table
+    'id'                        -- id_column
+);
+
+CREATE TABLE abalone_text AS SELECT (id::varchar || ' test')::VARCHAR AS id_text, * FROM abalone;
+
+SELECT xgboost(
+    'abalone_text',
+    'xgb_text_out',
+    'id_text',
+    'sex',
+    '*',
+    ARRAY['id']
+);
+
+SELECT xgboost_predict(
+    'abalone_text',          -- test_table
+    'xgb_text_out',          -- model_table
+    'xgb_text_score_out',    -- predict_output_table
+    'id_text'                -- id_column
+);