You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nk...@apache.org on 2024/03/02 00:22:54 UTC

(madlib) branch madlib2-master updated (22dd9986 -> 5b6f0033)

This is an automated email from the ASF dual-hosted git repository.

nkak pushed a change to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git


    from 22dd9986 PMML: Fix segfault in postgres dev-check
     new c9987434 PMML: Improve multinom dev-check tests
     new 43775de2 PMML: Improve ordinal dev-check tests
     new 12fb8887 PMML: Improve dev-check tests for decision tree
     new fa57c4fb PMML: Improve dev-check tests for random forest
     new 769f758e PMML: Improve namespec dev-check tests
     new 0c1cd4ff PMML: Add tests for intercept acting as a predictor
     new 0b75a0af PMML: Separate out datasets and setup functions
     new b944045e PMML: Consider spaces when parsing the indep var
     new 5b6f0033 Update README to point to madlib2-master build

The 9 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 README.md                                          |   2 +-
 src/ports/postgres/modules/pmml/formula.py_in      |  16 +-
 .../test/pmml.setup.datasets.sql_in}               | 251 +++++----------------
 .../postgres/modules/pmml/test/pmml.setup.sql_in   | 228 -------------------
 .../modules/pmml/test/pmml_check_fields.sql_in     |   5 +-
 .../postgres/modules/pmml/test/pmml_dt.sql_in      | 117 +++++++++-
 .../modules/pmml/test/pmml_glm_binomial.sql_in     |   3 +
 .../modules/pmml/test/pmml_glm_gamma.sql_in        |   3 +
 .../postgres/modules/pmml/test/pmml_glm_ig.sql_in  |   3 +
 .../modules/pmml/test/pmml_glm_normal.sql_in       |   3 +
 .../modules/pmml/test/pmml_glm_poisson.sql_in      |   3 +
 .../pmml/test/pmml_glm_with_grouping.sql_in        |   3 +
 .../pmml/test/pmml_glm_with_name_spec.sql_in       |  12 +
 .../pmml/test/pmml_intercept_as_predictor.sql_in   |  87 +++++++
 .../postgres/modules/pmml/test/pmml_linear.sql_in  |   3 +
 .../modules/pmml/test/pmml_logistic.sql_in         |   3 +
 .../modules/pmml/test/pmml_multinom.sql_in         | 107 +++++++--
 .../postgres/modules/pmml/test/pmml_ordinal.sql_in |  54 +++--
 .../postgres/modules/pmml/test/pmml_rf.sql_in      |  72 +++++-
 .../test/pmml_with_non_array_expression.sql_in     |   3 +
 .../pmml/test/unit_tests/test_formula.py_in        | 177 +++++++++++++++
 21 files changed, 691 insertions(+), 464 deletions(-)
 copy src/ports/postgres/modules/{glm/test/glm.ic.sql_in => pmml/test/pmml.setup.datasets.sql_in} (74%)
 create mode 100644 src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in


(madlib) 04/09: PMML: Improve dev-check tests for random forest

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit fa57c4fb40ce71f104449ec8356a3e0d53f97426
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 17:10:46 2024 -0800

    PMML: Improve dev-check tests for random forest
    
    JIRA: MADLIB-1517
    
    This commit adds a few more random forest pmml tests that compare
    forest_predict's output with pypmml's output
---
 .../postgres/modules/pmml/test/pmml_rf.sql_in      | 72 ++++++++++++++++++++--
 1 file changed, 68 insertions(+), 4 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
index b91db5d4..805b977a 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_rf.sql_in
@@ -1,3 +1,10 @@
+\i m4_regexp(MADLIB_LIBRARY_PATH,
+             `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
 DROP TABLE IF EXISTS dt_golf;
 CREATE TABLE dt_golf (
     id integer NOT NULL,
@@ -6,7 +13,7 @@ CREATE TABLE dt_golf (
     humidity double precision,
     windy text,
     class text
-) ;
+);
 
 INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
 (1, 'sunny', 85, 85, 'false', 'Don''t Play'),
@@ -33,7 +40,7 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          'humidity, windy'::text,   -- features
                          NULL::text,        -- exclude columns
                          'class'::text,      -- grouping
-                         5::integer,     -- num_trees
+                         3::integer,     -- num_trees
                          1::integer,        -- num_random_features
                          FALSE,       -- importance
                          1::integer,        -- num_permutations
@@ -43,9 +50,51 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          3::integer        -- number of bins per continuous variable
                          );
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_temperature::double precision','predicted_(temperature::double precision)_pmml_prediction');
+
 -------------------------------------------------------------------------
 
+-- classification, no grouping
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
+SELECT forest_train('dt_golf'::text,         -- source table
+                         'train_output'::text,    -- output model table
+                         'id'::text,              -- id column
+                         '"OUTLOOK"'::text,           -- response
+                         'humidity, windy'::text,   -- features
+                         NULL::text,        -- exclude columns
+                         NULL,      -- no grouping
+                         5::integer,     -- num_trees
+                         1::integer,        -- num_random_features
+                         FALSE,       -- importance
+                         1::integer,        -- num_permutations
+                         5::integer,        -- max depth
+                         1::integer,        -- min split
+                         1::integer,         -- min bucket
+                         3::integer        -- number of bins per continuous variable
+                         );
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
+
 -- classification, grouping
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
 SELECT forest_train('dt_golf'::text,         -- source table
@@ -65,6 +114,21 @@ SELECT forest_train('dt_golf'::text,         -- source table
                          3::integer        -- number of bins per continuous variable
                          );
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS forest_predict_output;
+SELECT forest_predict('train_output',
+                   'dt_golf',
+                   'forest_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'forest_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
 -------------------------------------------------------------------------
 


(madlib) 06/09: PMML: Add tests for intercept acting as a predictor

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 0c1cd4ff9387eb563fecacb0a5d4bb2fdc96ce16
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Thu Feb 22 16:05:43 2024 -0800

    PMML: Add tests for intercept acting as a predictor
    
    JIRA: MADLIB-1517
    
    A previous commit 0cd28f9733927d63beaefc9488db7f8bfdb3bd80 made changes to the
    pmml code so that the intercept won't be used as a predictor. But it's still
    possible that this assumption may not be true in some scenarios and the
    intercept might still be treated as a predictor in the pmml.
    For example, consider this scenario:
    While using any of the regression algorithms, the user passes the independent
    variable as "ARRAY[x1,1,x2]" or "ARRAY[x1,x2,1]" instead of "ARRAY[1,x1,x2]".
    In this scenario, the pmml code will assume that there isn't an intercept in
    this expression and will treat "1" as a predictor.
    When predicting using this pmml, users will need to create a column/field named
    "1" which has the value 1 for each data row. The test added in this commit
    mimics this scenario.
---
 .../pmml/test/pmml_intercept_as_predictor.sql_in   | 84 ++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
new file mode 100644
index 00000000..dc0e28bd
--- /dev/null
+++ b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+ \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
+------------------ This file will test scenarios when the intercept might still be treated as a predictor in the pmml --------------------
+
+-------------- intercept expression "1" not being the first value in the array  -------------------
+
+-- logistic
+DROP TABLE IF EXISTS logregr_model, logregr_model_summary;
+SELECT logregr_train(
+    '"Patients"',
+    'logregr_model',
+    '"Second_attack"',
+    'ARRAY["Treatment", 1, trait_anxiety]',
+    NULL,
+    20,
+    'irls'
+);
+CREATE TABLE patients_with_1 as SELECT 1 as "1", * from "Patients";
+
+DROP TABLE IF EXISTS logregr_predict_output; CREATE TABLE logregr_predict_output as SELECT id, logregr_predict(coef, ARRAY["Treatment", 1, trait_anxiety])
+FROM logregr_model, "Patients";
+SELECT test_pmml_output('patients_with_1', 'logregr_model', 'logregr_predict_output', 'id', 'logregr_predict', 'predicted_Second_attack_pmml_prediction');
+
+DROP TABLE IF EXISTS logregr_predict_output; CREATE TABLE logregr_predict_output as SELECT id, logregr_predict_prob(coef, ARRAY["Treatment", 1, trait_anxiety])
+FROM logregr_model, "Patients";
+SELECT test_pmml_output('patients_with_1', 'logregr_model', 'logregr_predict_output', 'id', 'logregr_predict_prob', 'probability_true');
+
+DROP TABLE IF EXISTS logregr_model, logregr_model_summary;
+SELECT logregr_train(
+    '"Patients"',
+    'logregr_model',
+    '"Second_attack"',
+    'ARRAY["Treatment", trait_anxiety, 1]',
+    NULL,
+    20,
+    'irls'
+);
+
+DROP TABLE IF EXISTS logregr_predict_output; CREATE TABLE logregr_predict_output as SELECT id, logregr_predict(coef, ARRAY["Treatment", trait_anxiety, 1])
+FROM logregr_model, "Patients";
+SELECT test_pmml_output('patients_with_1', 'logregr_model', 'logregr_predict_output', 'id', 'logregr_predict', 'predicted_Second_attack_pmml_prediction');
+
+DROP TABLE IF EXISTS logregr_predict_output; CREATE TABLE logregr_predict_output as SELECT id, logregr_predict_prob(coef, ARRAY["Treatment", trait_anxiety, 1])
+FROM logregr_model, "Patients";
+SELECT test_pmml_output('patients_with_1', 'logregr_model', 'logregr_predict_output', 'id', 'logregr_predict_prob', 'probability_true');
+
+---- glm -----
+DROP TABLE IF EXISTS glm_model, glm_model_summary;
+SELECT glm(
+    'abalone',
+    'glm_model',
+    'rings',
+    'ARRAY[1.0, length, diameter, height, whole, shucked, viscera, shell]',
+    'family=gaussian, link=identity', NULL, 'max_iter=1000, tolerance=1e-16'
+);
+CREATE TABLE abalone_with_1 AS SELECT 1, * FROM abalone;
+DROP TABLE IF EXISTS glm_predict_out; CREATE TABLE glm_predict_out as SELECT id, glm_predict(coef, ARRAY[1, length, diameter, height, whole, shucked, viscera, shell], 'identity')
+FROM glm_model, abalone_with_1;
+SELECT test_pmml_output('abalone_with_1', 'glm_model', 'glm_predict_out', 'id', 'glm_predict', 'predicted_rings_pmml_prediction');


(madlib) 03/09: PMML: Improve dev-check tests for decision tree

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 12fb88877de26f6efd6777f6ac9fa12960e3658e
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 16:18:15 2024 -0800

    PMML: Improve dev-check tests for decision tree
    
    JIRA: MADLIB-1517
    
    This commit adds a few more decision tree pmml tests that compare tree_predict's
    output with pypmml's output
---
 .../postgres/modules/pmml/test/pmml_dt.sql_in      | 117 ++++++++++++++++++++-
 1 file changed, 113 insertions(+), 4 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_dt.sql_in b/src/ports/postgres/modules/pmml/test/pmml_dt.sql_in
index c03d6f34..86d97360 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_dt.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_dt.sql_in
@@ -1,3 +1,10 @@
+\i m4_regexp(MADLIB_LIBRARY_PATH,
+             `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
 DROP TABLE IF EXISTS dt_golf;
 CREATE TABLE dt_golf (
     id integer NOT NULL,
@@ -24,12 +31,49 @@ INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
 (13, 'overcast', 81, 75, 'false', 'Play'),
 (14, 'rain', 71, 80, 'true', 'Don''t Play');
 
+-- regression, no grouping
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT tree_train('dt_golf'::text,         -- source table
+                         'train_output'::text,    -- output model table
+                         'id'::text,              -- id column
+                         'temperature'::text,           -- response
+                         'humidity, windy'::text,   -- features
+                         NULL::text,        -- exclude columns
+                         'gini'::text,      -- split criterion
+                         NULL::text,     -- no grouping
+                         NULL::text,        -- no weights
+                         10::integer,       -- max depth
+                         3::integer,        -- min split
+                         1::integer,        -- min bucket
+                         3::integer,        -- number of bins per continuous variable
+                         'cp=0.01'          -- cost-complexity pruning parameter
+                         );
+
+SELECT _print_decision_tree(tree) from train_output;
+-- TODO: Enable these lines after the DT tree_display bug is fixed
+-- SELECT tree_display('train_output', False);
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_temperature','predicted_temperature_pmml_prediction');
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'prob_temperature','predicted_temperature_pmml_prediction');
+
+
 -- regression, grouping
 DROP TABLE IF EXISTS train_output, train_output_summary;
 SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
                          'id'::text,              -- id column
-                         'temperature::double precision'::text,           -- response
+                         'temperature'::text,           -- response
                          'humidity, windy'::text,   -- features
                          NULL::text,        -- exclude columns
                          'gini'::text,      -- split criterion
@@ -46,8 +90,58 @@ SELECT _print_decision_tree(tree) from train_output;
 -- TODO: Enable these lines after the DT tree_display bug is fixed
 -- SELECT tree_display('train_output', False);
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_temperature','predicted_temperature_pmml_prediction');
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'prob_temperature','predicted_temperature_pmml_prediction');
+
 -------------------------------------------------------------------------
+-- classification, no grouping
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT tree_train('dt_golf'::text,         -- source table
+                         'train_output'::text,    -- output model table
+                         'id'::text,              -- id column
+                         '"OUTLOOK"'::text,           -- response
+                         'humidity, windy'::text,   -- features
+                         NULL::text,        -- exclude columns
+                         'gini'::text,      -- split criterion
+                         NULL::text,     -- no grouping
+                         NULL::text,        -- no weights
+                         10::integer,       -- max depth
+                         3::integer,        -- min split
+                         1::integer,        -- min bucket
+                         3::integer,        -- number of bins per continuous variable
+                         'cp=0.01'          -- cost-complexity pruning parameter
+                         );
+
+SELECT _print_decision_tree(tree) from train_output;
+-- SELECT tree_display('train_output', False);
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
 
 -- classification, grouping
 DROP TABLE IF EXISTS train_output, train_output_summary;
@@ -58,7 +152,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'humidity, windy'::text,   -- features
                          NULL::text,        -- exclude columns
                          'gini'::text,      -- split criterion
-                         'class'::text,     -- no grouping
+                         'class'::text,     -- grouping
                          NULL::text,        -- no weights
                          10::integer,       -- max depth
                          3::integer,        -- min split
@@ -70,6 +164,21 @@ SELECT tree_train('dt_golf'::text,         -- source table
 SELECT _print_decision_tree(tree) from train_output;
 -- SELECT tree_display('train_output', False);
 
-SELECT pmml('train_output');
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'response');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_OUTLOOK','predicted_OUTLOOK_pmml_prediction');
+
+DROP TABLE IF EXISTS tree_predict_output;
+SELECT tree_predict('train_output',
+                   'dt_golf',
+                   'tree_predict_output',
+                   'prob');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_overcast','probability_overcast');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_rain','probability_rain');
+SELECT test_pmml_output('dt_golf', 'train_output', 'tree_predict_output','id', 'estimated_prob_sunny','probability_sunny');
+
 -------------------------------------------------------------------------
 


(madlib) 08/09: PMML: Consider spaces when parsing the indep var

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit b944045e624e791b6c41bca4ef5d56ba54d4bb68
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 15:06:46 2024 -0800

    PMML: Consider spaces when parsing the indep var
    
    JIRA: MADLIB-1517
    
    A previous commit 0cd28f9733927d63beaefc9488db7f8bfdb3bd80 added support to
    parse the independent var expression to determine if an intercept was used
    during training. This commit improves the regex by adding support for spaces
    and also adds a detailed explanation for the regex
    
    This commit also fixes a warning that would get generated with the previous regex:
    ```
    re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I)
    <stdin>:1: FutureWarning: Possible nested set at position 6
    ```
---
 src/ports/postgres/modules/pmml/formula.py_in      |  16 +-
 .../pmml/test/unit_tests/test_formula.py_in        | 177 +++++++++++++++++++++
 2 files changed, 191 insertions(+), 2 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/formula.py_in b/src/ports/postgres/modules/pmml/formula.py_in
index 0d575315..5f97bb51 100644
--- a/src/ports/postgres/modules/pmml/formula.py_in
+++ b/src/ports/postgres/modules/pmml/formula.py_in
@@ -12,8 +12,20 @@ class Formula(object):
         :param coef_len: Length of all the coefficients including the
                          intercept's coefficient(if any)
         """
-        # TODO: Fix the nested warning and add explanation for the regex
-        self.array_expr = re.compile(r'array[[]([0-1],|[0-1].0,)?(["a-z0-9_, .]+)[]]', flags=re.I)
+
+        self.array_expr = re.compile(r'array\[(\s*?[0-1]\s*?,\s*?|\s*?[0-1].0\s*?,\s*?)?(["a-z0-9_, .]+)]',
+                                     flags=re.I)
+        # Regex explanation:
+        # array\[ matches "array[" or "ARRAY[" (case-insensitive because of re.I)
+        # \s*? lazily matches 0 or more whitespace characters
+        # | represents an OR
+        # [0-1]\s*?, matches either "1," or "0," including spaces
+        # [0-1].0\s*?, matches "1.0," or "0.0," including spaces (the "." is unescaped, so it also matches any other single character except a newline in that position)
+        #   \s*?[0-1]\s*?,\s*?|\s*?[0-1].0\s*?,\s*? matches either "1,", "0,", "1.0," or "0.0," including surrounding spaces
+        # ()? captures the output of that group. ? means it's optional
+        #   That's why we use ()? for the first capture group, i.e. "1,", "0,", "1.0," or "0.0,"
+        # (["a-z0-9_, .]+) matches one or more of these characters and captures the output in a group
+
         self.non_array_expr = re.compile(r'["a-z0-9_]+', flags=re.I)
 
         self.intercept = self.has_intercept(x_str)
diff --git a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
index 6075edc4..2ce7b8ae 100644
--- a/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
+++ b/src/ports/postgres/modules/pmml/test/unit_tests/test_formula.py_in
@@ -138,11 +138,22 @@ class FormulaTestCase(unittest.TestCase):
         self.assertEqual(f.intercept, True)
 
     def test_formula_array_with_invalid_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[0.1,foo,bar]', 3)
+        self.assertEqual(f.x, ['0.1', 'foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+
         f = self.subject.Formula('baaz', 'ARRAY[10,foo,bar]', 3)
         self.assertEqual(f.x, ['10', 'foo', 'bar'])
         self.assertEqual(f.y, "baaz")
         self.assertEqual(f.intercept, False)
 
+        f = self.subject.Formula('baaz', 'ARRAY[  10  , foo,bar]', 3)
+        self.assertEqual(f.x, ['10', 'foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
         # A negative number shouldn't be allowed, but technically the train functions
         # don't error out, so adding this test for the sake of completeness
         f = self.subject.Formula('baaz', 'ARRAY[-2,foo,bar]', 3)
@@ -203,6 +214,172 @@ class FormulaTestCase(unittest.TestCase):
         self.assertEqual(f.y, "baaz")
         self.assertEqual(f.intercept, False)
 
+    def test_formula_array_with_spaces_with_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 1,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1  , foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1,  "1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 1.0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0  , foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0,  "1.0",bar]', 3)
+        self.assertEqual(f.x, ['1.0', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  1.0 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0 ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[1.0  ,"1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[ 0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0  ,foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[0,  "1",bar]', 3)
+        self.assertEqual(f.x, ['1', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0 ,  foo,bar]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  0 ,  foo  ,  bar  ]', 3)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, True)
+
+    def test_formula_array_with_spaces_without_intercept(self):
+        f = self.subject.Formula('baaz', 'ARRAY[  foo,bar]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+        f = self.subject.Formula('baaz', 'ARRAY[  foo ,  bar ]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+        f = self.subject.Formula('baaz', 'ARRAY[foo  ,bar]', 2)
+        self.assertEqual(f.x, ['foo', 'bar'])
+        self.assertEqual(f.y, "baaz")
+        self.assertEqual(f.intercept, False)
+
+
     def test_formula_nonarray(self):
         f = self.subject.Formula('baaz', 'foo', 3)
         self.assertEqual(f.x, ['foo[1]', 'foo[2]'])


(madlib) 05/09: PMML: Improve namespec dev-check tests

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 769f758e4db0b4e88112e0d976f09194365768ed
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Thu Feb 22 16:05:32 2024 -0800

    PMML: Improve namespec dev-check tests
    
    JIRA: MADLIB-1517
    
    This commit adds a few more namespec pmml tests
---
 .../postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in    | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
index 24c883c6..8b96460f 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
@@ -45,6 +45,11 @@ SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binom
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','foo1+foo2+foo3+foo4+foo5+foo6+foo7');
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','{bar,foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','{foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
+-- Even if we use explicit "1" in the formula, we will get the correct xml although it won't use the formula provided
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','bar ~ 1+foo1+foo2+foo3+foo4+foo5+foo6+foo7');
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','1+foo1+foo2+foo3+foo4+foo5+foo6+foo7');
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','{bar,1,foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict', 'probability_true', '','{1,foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
 
 -- Test output category
 DROP TABLE IF EXISTS glm_predict_binomial_logit_out; CREATE TABLE glm_predict_binomial_logit_out as SELECT id, glm_predict_binomial(coef, ARRAY[1, length, diameter, height, whole, shucked, viscera, shell], 'logit')
@@ -53,6 +58,10 @@ SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binom
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_rings < 10', '', 'foo1+foo2+foo3+foo4+foo5+foo6+foo7');
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_bar', '', '{bar,foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
 SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_rings < 10', '', '{foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
+-- Even if we use explicit "1" in the formula, we will get the correct xml although it won't use the formula provided
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_rings < 10', '', 'bar ~ 1+foo1+foo2+foo3+foo4+foo5+foo6+foo7');
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_rings < 10', '', '1+foo1+foo2+foo3+foo4+foo5+foo6+foo7');
+SELECT test_pmml_output('abalone_test_for_pmml', 'glm_model', 'glm_predict_binomial_logit_out', 'id', 'glm_predict_binomial', 'predicted_rings < 10', '', '{1,bar,foo1,foo2,foo3,foo4,foo5,foo6,foo7}');
 
 ----------------------- without intercept -------------------------------
 DROP TABLE IF EXISTS glm_model, glm_model_summary;


(madlib) 01/09: PMML: Improve multinom dev-check tests

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit c9987434d4bef4dc468ac9bbead456e5103b4693
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 11:11:36 2024 -0800

    PMML: Improve multinom dev-check tests
    
    JIRA: MADLIB-1517
    
    This commit adds a few more multinom pmml tests, including grouping and
    no-intercept cases
---
 .../modules/pmml/test/pmml_multinom.sql_in         | 107 ++++++++++++++++++---
 1 file changed, 91 insertions(+), 16 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_multinom.sql_in b/src/ports/postgres/modules/pmml/test/pmml_multinom.sql_in
index 576d96ac..07f8e0a9 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_multinom.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_multinom.sql_in
@@ -216,27 +216,82 @@ INSERT INTO multinom_test(feat1, feat2, cat, g) VALUES
 (2,62,2,'B');
 ALTER TABLE multinom_test ADD COLUMN id SERIAL;
 
-DROP TABLE IF EXISTS mglm_out, mglm_out_summary;
+-- '0' as ref category
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
 SELECT multinom(
     'multinom_test',
-    'mglm_out',
+    'multinom_model',
     'cat',
     'ARRAY[1, feat1, feat2]',
     '0',
     'logit');
-DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('mglm_out','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
-SELECT test_pmml_output('multinom_test', 'mglm_out', 'multinom_predict_out', 'id', '0', 'probability_0');
-SELECT test_pmml_output('multinom_test', 'mglm_out', 'multinom_predict_out', 'id', '1', 'probability_1');
-SELECT test_pmml_output('multinom_test', 'mglm_out', 'multinom_predict_out', 'id', '2', 'probability_2');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+
+-- '2' as ref category
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
+SELECT multinom(
+    'multinom_test',
+    'multinom_model',
+    'cat',
+    'ARRAY[1, feat1, feat2]',
+    '2',
+    'logit', NULL,
+    'max_iter=100,optimizer=irls,tolerance=1e-6');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+
+------------------------------------ without intercept -------------------------------------------------------------------
+-- '0' as ref category
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
+SELECT multinom(
+    'multinom_test',
+    'multinom_model',
+    'cat',
+    'ARRAY[feat1, feat2]',
+    '0',
+    'logit');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+
+-- '2' as ref category
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
+SELECT multinom(
+    'multinom_test',
+    'multinom_model',
+    'cat',
+    'ARRAY[feat1, feat2]',
+    '2',
+    'logit');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
 
-DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('mglm_out','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
-SELECT test_pmml_output('multinom_test', 'mglm_out', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
 
 ------------------------------------ with grouping -------------------------------------------------------------------
--- DROP TABLE IF EXISTS mglm_grp_out, mglm_grp_out_summary;
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
 SELECT multinom(
     'multinom_test',
-    'mglm_grp_out',
+    'multinom_model',
     'cat',
     'ARRAY[1, feat1, feat2]',
     '0',
@@ -244,10 +299,30 @@ SELECT multinom(
     'g');
 
 -- PMML export
-SELECT pmml('mglm_grp_out');
-DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('mglm_grp_out','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
-CREATE TABLE multinom_predict_grp_out as (SELECT a.id, a.g, b."0", b."1", b."2", a.feat1, a.feat2 FROM multinom_test AS a LEFT JOIN multinom_predict_out AS b on a.id =b.id);
-SELECT test_pmml_output('multinom_test', 'mglm_grp_out', 'multinom_predict_grp_out', 'id', '0', 'probability_0','g');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+DROP TABLE IF EXISTS multinom_predict_grp_out; CREATE TABLE multinom_predict_grp_out as (SELECT a.id, a.g, b."0", b."1", b."2", a.feat1, a.feat2 FROM multinom_test AS a LEFT JOIN multinom_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_grp_out', 'id', '0', 'probability_0','g');
+
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+DROP TABLE IF EXISTS multinom_predict_grp_out; CREATE TABLE multinom_predict_grp_out as (SELECT a.id, a.g, b.category FROM multinom_test AS a LEFT JOIN multinom_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_grp_out', 'id', 'category', 'predicted_cat_pmml_prediction', 'g');
+
+---- without intercept -----
+DROP TABLE IF EXISTS multinom_model, multinom_model_summary;
+SELECT multinom(
+    'multinom_test',
+    'multinom_model',
+    'cat',
+    'ARRAY[feat1, feat2]',
+    '0',
+    'logit',
+    'g');
+
+-- PMML export
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'probability',FALSE,'id');
+DROP TABLE IF EXISTS multinom_predict_grp_out; CREATE TABLE multinom_predict_grp_out as (SELECT a.id, a.g, b."0", b."1", b."2", a.feat1, a.feat2 FROM multinom_test AS a LEFT JOIN multinom_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_grp_out', 'id', '0', 'probability_0','g');
 
-DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('mglm_grp_out','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
-SELECT test_pmml_output('multinom_test', 'mglm_grp_out', 'multinom_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+DROP TABLE IF EXISTS multinom_predict_out; SELECT multinom_predict('multinom_model','multinom_test', 'multinom_predict_out', 'response',FALSE,'id');
+DROP TABLE IF EXISTS multinom_predict_grp_out; CREATE TABLE multinom_predict_grp_out as (SELECT a.id, a.g, b.category FROM multinom_test AS a LEFT JOIN multinom_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('multinom_test', 'multinom_model', 'multinom_predict_grp_out', 'id', 'category', 'predicted_cat_pmml_prediction', 'g');


(madlib) 02/09: PMML: Improve ordinal dev-check tests

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 43775de29c01d83e9d491e2e1357603ca611294d
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Tue Feb 20 15:06:07 2024 -0800

    PMML: Improve ordinal dev-check tests
    
    JIRA: MADLIB-1517
    
    This commit adds a few more ordinal pmml tests that compare ordinal_predict's
    output with pypmml's output
---
 .../postgres/modules/pmml/test/pmml_ordinal.sql_in | 54 ++++++++++++++++------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml_ordinal.sql_in b/src/ports/postgres/modules/pmml/test/pmml_ordinal.sql_in
index 37788782..ceac8291 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_ordinal.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_ordinal.sql_in
@@ -1,13 +1,20 @@
+\i m4_regexp(MADLIB_LIBRARY_PATH,
+             `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.sql_in'
+)
+
+m4_changequote(`<!'', `!>'')
+
 -- create table for training
-DROP TABLE IF EXISTS "Multinom_test";
-CREATE TABLE "Multinom_test" (
+DROP TABLE IF EXISTS "Ordinal_test";
+CREATE TABLE "Ordinal_test" (
     "FEAT1" INTEGER,
     feat2 INTEGER,
     cat INTEGER,
     g CHAR
 );
 
-INSERT INTO "Multinom_test"("FEAT1", feat2, cat, g) VALUES
+INSERT INTO "Ordinal_test"("FEAT1", feat2, cat, g) VALUES
 (1,35,1,'A'),
 (2,33,0,'A'),
 (3,39,1,'A'),
@@ -208,33 +215,50 @@ INSERT INTO "Multinom_test"("FEAT1", feat2, cat, g) VALUES
 (2,67,2,'B'),
 (2,65,2,'B'),
 (2,62,2,'B');
+ALTER TABLE "Ordinal_test" ADD COLUMN id SERIAL;
 
--- training function for logit link
+-- logit
 drop table if exists ordinal_logit, ordinal_logit_summary;
 SELECT ordinal(
-    '"Multinom_test"',
+    '"Ordinal_test"',
     'ordinal_logit',
     'cat',
     'ARRAY["FEAT1", feat2]',
     '0<1<2',
     'logit'
     );
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_logit','"Ordinal_test"', 'ordinal_predict_out', 'probability');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit', 'ordinal_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit', 'ordinal_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit', 'ordinal_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_logit','"Ordinal_test"', 'ordinal_predict_out', 'response');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit', 'ordinal_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+
 
--- training function for probit link
+-- probit
 drop table if exists ordinal_probit, ordinal_probit_summary;
 SELECT ordinal(
-    '"Multinom_test"',
+    '"Ordinal_test"',
     'ordinal_probit',
     'cat',
     'ARRAY["FEAT1", feat2]',
-    '0<1<2',
+    '2<0<1',
     'probit'
     );
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_probit','"Ordinal_test"', 'ordinal_predict_out', 'probability');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_probit', 'ordinal_predict_out', 'id', '0', 'probability_0');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_probit', 'ordinal_predict_out', 'id', '1', 'probability_1');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_probit', 'ordinal_predict_out', 'id', '2', 'probability_2');
+
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_probit','"Ordinal_test"', 'ordinal_predict_out', 'response');
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_probit', 'ordinal_predict_out', 'id', 'category', 'predicted_cat_pmml_prediction');
+
 
--- training funcion for grouping case
+--------- grouping ----------------
 drop table if exists ordinal_logit_grp, ordinal_logit_grp_summary;
 SELECT ordinal(
-    '"Multinom_test"',
+    '"Ordinal_test"',
     'ordinal_logit_grp',
     'cat',
     'ARRAY["FEAT1", feat2]',
@@ -242,10 +266,12 @@ SELECT ordinal(
     'logit',
     'g'
     );
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_logit_grp','"Ordinal_test"', 'ordinal_predict_out', 'probability',FALSE);
+DROP TABLE IF EXISTS ordinal_predict_grp_out; CREATE TABLE ordinal_predict_grp_out as (SELECT a.id, a.g, b."0", b."1", b."2", a."FEAT1", a.feat2 FROM "Ordinal_test" AS a LEFT JOIN ordinal_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit_grp', 'ordinal_predict_grp_out', 'id', '0', 'probability_0','g');
 
+DROP TABLE IF EXISTS ordinal_predict_out; SELECT ordinal_predict('ordinal_logit_grp','"Ordinal_test"', 'ordinal_predict_out', 'response',FALSE);
+DROP TABLE IF EXISTS ordinal_predict_grp_out; CREATE TABLE ordinal_predict_grp_out as (SELECT a.id, a.g, b.category FROM "Ordinal_test" AS a LEFT JOIN ordinal_predict_out AS b on a.id =b.id);
+SELECT test_pmml_output('"Ordinal_test"', 'ordinal_logit_grp', 'ordinal_predict_grp_out', 'id', 'category', 'predicted_cat_pmml_prediction', 'g');
 
--- pmml
-SELECT pmml('ordinal_logit');
-SELECT pmml('ordinal_probit');
-SELECT pmml('ordinal_logit_grp');
 


(madlib) 09/09: Update README to point to madlib2-master build

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 5b6f0033f1e43cad33acf8c30b303ef0d2f9da83
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Fri Mar 1 10:59:43 2024 -0800

    Update README to point to madlib2-master build
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1d988665..d186e892 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 It provides data-parallel implementations of mathematical, statistical and
 machine learning methods for structured and unstructured data.
 
-[![Build Status](https://ci-builds.apache.org/job/Madlib/job/madlib-build/job/master/badge/icon)](https://ci-builds.apache.org/job/Madlib/job/madlib-build/job/master/)
+[![Build Status](https://ci-builds.apache.org/job/Madlib/job/madlib-build/job/madlib2-master/badge/icon)](https://ci-builds.apache.org/job/Madlib/job/madlib-build/job/madlib2-master/)
 
 Installation and Contribution
 ==============================


(madlib) 07/09: PMML: Separate out datasets and setup functions

Posted by nk...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch madlib2-master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 0b75a0af162cf7c2e536bad3f92107f7748c325b
Author: Nikhil Kak <nk...@vmware.com>
AuthorDate: Mon Feb 26 23:17:01 2024 -0800

    PMML: Separate out datasets and setup functions
    
    JIRA: MADLIB-1517
    
    This commit separates the setup functions from the datasets used in the pmml
    dev-check tests. This makes it possible to import just the setup functions
    without also creating the datasets, which some modules (e.g. dt, rf) do not
    use.
---
 ...mml.setup.sql_in => pmml.setup.datasets.sql_in} | 138 +------------
 .../postgres/modules/pmml/test/pmml.setup.sql_in   | 228 ---------------------
 .../modules/pmml/test/pmml_check_fields.sql_in     |   5 +-
 .../modules/pmml/test/pmml_glm_binomial.sql_in     |   3 +
 .../modules/pmml/test/pmml_glm_gamma.sql_in        |   3 +
 .../postgres/modules/pmml/test/pmml_glm_ig.sql_in  |   3 +
 .../modules/pmml/test/pmml_glm_normal.sql_in       |   3 +
 .../modules/pmml/test/pmml_glm_poisson.sql_in      |   3 +
 .../pmml/test/pmml_glm_with_grouping.sql_in        |   3 +
 .../pmml/test/pmml_glm_with_name_spec.sql_in       |   3 +
 .../pmml/test/pmml_intercept_as_predictor.sql_in   |   3 +
 .../postgres/modules/pmml/test/pmml_linear.sql_in  |   3 +
 .../modules/pmml/test/pmml_logistic.sql_in         |   3 +
 .../test/pmml_with_non_array_expression.sql_in     |   3 +
 14 files changed, 38 insertions(+), 366 deletions(-)

diff --git a/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in b/src/ports/postgres/modules/pmml/test/pmml.setup.datasets.sql_in
similarity index 66%
copy from src/ports/postgres/modules/pmml/test/pmml.setup.sql_in
copy to src/ports/postgres/modules/pmml/test/pmml.setup.datasets.sql_in
index dad6a03f..1c60bd94 100644
--- a/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml.setup.datasets.sql_in
@@ -224,140 +224,4 @@ INSERT INTO "Patients"(ID, "Second_attack", "Treatment", trait_anxiety, g1, g2)
 (17, 0, 0, 55, 'B', 'z'),
 (18, 0, 0, 45, 'B', 'z'),
 (19, 0, 0, 50, 'B', 'z'),
-(20, 0, 0, 60, 'B', 'z');
-/* ----------------------------------------------------------------------- *//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *//* ----------------------------------------------------------------------- */
-
-/**
- * @brief Compares the madlib prediction output with pypmml prediction output.
- * Assumes that the caller has already run madlib's predict function on the test table
- * Creates a dictionary of the madlib results by querying the madlib_predict_table
- * Calls the pmml function on the trained madlib table
- * Loops through all the rows of the test table and FOR EACH ROW:
-    ** Gets the madlib prediction from the dict created above
-    ** Runs pypmml predict
-    ** Compares madlib and pypmml results
-    ** Throws an error if the values don't match
- * @param test_table The test table which was used for madlib prediction. This table will be used for pmml prediction
- * @param madlib_train_table The trained model table
- * @param madlib_predict_table The madlib prediction table
- * @param id_col id column in the test table used for uniquely identifying the rows for comparison
- * @param madlib_prediction_metric Name of the column in madlib prediction table that contains the value to compare
- * @param pypmml_prediction_metric Name of the key in pmml prediction dict that contains the value to compare
- * @param grouping_col grouping col used during training
- * @param name_spec name_spec to be passed to the madlib pmml function
- * @returns
- *
- */
-CREATE OR REPLACE FUNCTION test_pmml_output(test_table TEXT, madlib_train_table TEXT, madlib_predict_table TEXT,
-       id_col TEXT, madlib_prediction_metric TEXT, pypmml_prediction_metric TEXT, grouping_col TEXT, name_spec TEXT) returns VOID as $$
-    # This function returns a key so that we can compare the results from the madlib prediction table with the pmml
-    # result that gets run on the test table
-    #  The key of this dict is either just the 'id' or a combination of 'id' and all the grouping cold
-    def get_unique_key(input):
-        unique_key = [str(input[id_col])]
-        if grouping_col != '':
-            cols = grouping_col.split(',')
-            for col in cols:
-                unique_key.append(str(input[col]))
-        return ','.join(unique_key)
-
-    def are_results_different(madlib_result, pypmml_result):
-        from math import isnan, isinf
-        if isinstance(madlib_result, str):
-            return str(pypmml_result) != str(madlib_result)
-        elif isinstance(madlib_result, float) or isinstance(madlib_result, int):
-            if isinf(float(madlib_result)) or isnan(float(madlib_result)):
-                return True
-            if isinf(float(pypmml_result)) or isnan(float(pypmml_result)):
-                return True
-            tol = 1e-6
-            return abs(pypmml_result - madlib_result) > tol
-        return True
-
-    from pypmml import Model
-
-    madlib_predict_output_table = plpy.execute("SELECT * from {}".format(madlib_predict_table))
-    madlib_predict_output = {}
-    for madlib_pred in madlib_predict_output_table:
-        madlib_predict_output[get_unique_key(madlib_pred)] = madlib_pred[madlib_prediction_metric]
-
-    #get madlib pmml output string
-    if name_spec == '':
-        pmml_query = "SELECT pmml('{}')".format(madlib_train_table)
-    else:
-        pmml_query = "SELECT pmml('{}','{}')".format(madlib_train_table, name_spec)
-    madlib_pmml_str = plpy.execute(pmml_query)[0]["pmml"]
-
-    # load pypmml model using madlib pmml string
-    pypmml_model = Model.fromString(madlib_pmml_str)
-
-    # load data and run pypmml predict
-    test_data = plpy.execute("SELECT * from {}".format(test_table))
-    for data in test_data:
-        madlib_result = madlib_predict_output[get_unique_key(data)]
-        pypmml_prediction = pypmml_model.predict(data)
-        if pypmml_prediction_metric not in pypmml_prediction:
-            plpy.error("Metric: '{}' does not exist in pypmml output: {}".format(pypmml_prediction_metric, pypmml_prediction))
-        pypmml_result = pypmml_prediction[pypmml_prediction_metric]
-        if are_results_different(madlib_result, pypmml_result):
-            plpy.info(madlib_pmml_str)
-            plpy.error("pmml comparison failed. input row: {}, metric to compare: {}, madlib result: {}, pypmml result: {}, detailed madlib output: {}, detailed pypmml output: {}".format(data, pypmml_prediction_metric, madlib_result, pypmml_result, madlib_predict_output, pypmml_prediction))
-$$ language plpython3u;
-
-CREATE OR REPLACE FUNCTION test_pmml_output(test_table TEXT, madlib_train_table TEXT, madlib_predict_table TEXT,
-       id_col TEXT, madlib_prediction_metric TEXT, pypmml_prediction_metric TEXT, grouping_col TEXT) returns VOID as $$
-    SELECT test_pmml_output(test_table, madlib_train_table, madlib_predict_table, id_col, madlib_prediction_metric, pypmml_prediction_metric, grouping_col, '')
-$$ language sql;
-
-CREATE OR REPLACE FUNCTION test_pmml_output(test_table TEXT, madlib_train_table TEXT, madlib_predict_table TEXT,
-       id_col TEXT, madlib_prediction_metric TEXT, pypmml_prediction_metric TEXT) returns VOID as $$
-    SELECT test_pmml_output(test_table, madlib_train_table, madlib_predict_table, id_col, madlib_prediction_metric, pypmml_prediction_metric, '', '')
-$$ language sql;
-
-
-CREATE OR REPLACE FUNCTION count_expected_names(madlib_train_table varchar, formula varchar, expected varchar[])
-RETURNS bigint AS $$
-    SELECT count(distinct result.name)
-    FROM
-        (SELECT text(unnest(xpath('/n:PMML/n:DataDictionary/n:DataField/@name',
-                pmml_col, ARRAY[ARRAY['n', 'http://www.dmg.org/PMML-4_1']]))) AS name
-        FROM (SELECT CASE WHEN $2 is NULL THEN pmml(madlib_train_table)
-                          ELSE pmml(madlib_train_table, $2)
-                     END AS pmml_col) pmml_output
-        ) result,
-        (SELECT unnest($3) AS name) expected
-    WHERE expected.name = result.name;
-$$ LANGUAGE sql;
-
-CREATE OR REPLACE FUNCTION count_expected_names(madlib_train_table varchar, name_spec varchar[], expected varchar[])
-RETURNS bigint AS $$
-    SELECT count(distinct result.name)
-    FROM
-        (SELECT text(unnest(xpath('/n:PMML/n:DataDictionary/n:DataField/@name',
-                pmml_col, ARRAY[ARRAY['n', 'http://www.dmg.org/PMML-4_1']]))) AS name
-        FROM (SELECT CASE WHEN $2 is NULL THEN pmml(madlib_train_table)
-                          ELSE pmml(madlib_train_table, $2)
-                     END AS pmml_col) pmml_output
-        ) result,
-        (SELECT unnest($3) AS name) expected
-    WHERE expected.name = result.name;
-$$ LANGUAGE sql;
+(20, 0, 0, 60, 'B', 'z');
\ No newline at end of file
diff --git a/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in b/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in
index dad6a03f..6688ef5f 100644
--- a/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml.setup.sql_in
@@ -1,230 +1,3 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *//* ----------------------------------------------------------------------- */
-
- DROP TABLE IF EXISTS houses CASCADE;
-CREATE TABLE houses (
-    id SERIAL NOT NULL,
-    tax INTEGER,
-    bedroom REAL,
-    bath REAL,
-    price INTEGER,
-    size INTEGER,
-    lot INTEGER,
-    g1 CHAR(1),
-    g2 CHAR(1)
-);
-
-INSERT INTO houses(tax, bedroom, bath, price, size, lot, g1, g2) VALUES
-( 590, 2, 1,    50000,  770, 22100, 'A', 'z'),
-(1050, 3, 2,    85000, 1410, 12000, 'A', 'y'),
-(20, 3, 1,    22500, 1060, 3500 , 'A', 'y'),
-( 870, 2, 2,    90000, 1300, 17500, 'A', 'y'),
-(1320, 3, 2,   133000, 1500, 30000, 'A', 'y'),
-(1350, 2, 1,    90500,  820, 25700, 'A', 'x'),
-(2790, 3, 2.5, 260000, 2130, 25000, 'A', 'x'),
-( 680, 2, 1,   142500, 1170, 22000, 'A', 'x'),
-(1840, 3, 2,   160000, 1500, 19000, 'B', 'x'),
-(3680, 4, 2,   240000, 2790, 20000, 'B', 'y'),
-(1660, 3, 1,    87000, 1030, 17500, 'B', 'y'),
-(1620, 3, 2,   118600, 1250, 20000, 'A', 'y'),
-(3100, 3, 2,   140000, 1760, 38000, 'B', 'y'),
-(2070, 2, 3,   148000, 1550, 14000, 'B', 'y'),
-( 650, 3, 1.5,  65000, 1450, 12000, 'B', 'y');
-
-DROP TABLE IF EXISTS abalone CASCADE;
-CREATE TABLE abalone (
-    id integer,
-    sex text,
-    length double precision,
-    diameter double precision,
-    height double precision,
-    whole double precision,
-    shucked double precision,
-    viscera double precision,
-    shell double precision,
-    rings integer
-);
-
-INSERT INTO abalone VALUES
-(3151, 'F', 0.655000000000000027, 0.505000000000000004, 0.165000000000000008, 1.36699999999999999, 0.583500000000000019, 0.351499999999999979, 0.396000000000000019, 10),
-(2026, 'F', 0.550000000000000044, 0.469999999999999973, 0.149999999999999994, 0.920499999999999985, 0.381000000000000005, 0.243499999999999994, 0.267500000000000016, 10),
-(3751, 'I', 0.434999999999999998, 0.375, 0.110000000000000001, 0.41549999999999998, 0.170000000000000012, 0.0759999999999999981, 0.14499999999999999, 8),
-(720, 'I', 0.149999999999999994, 0.100000000000000006, 0.0250000000000000014, 0.0149999999999999994, 0.00449999999999999966, 0.00400000000000000008, 0.0050000000000000001, 2),
-(1635, 'F', 0.574999999999999956, 0.469999999999999973, 0.154999999999999999, 1.1160000000000001, 0.509000000000000008, 0.237999999999999989, 0.340000000000000024, 10),
-(2648, 'I', 0.5, 0.390000000000000013, 0.125, 0.582999999999999963, 0.293999999999999984, 0.132000000000000006, 0.160500000000000004, 8),
-(1796, 'F', 0.57999999999999996, 0.429999999999999993, 0.170000000000000012, 1.47999999999999998, 0.65349999999999997, 0.32400000000000001, 0.41549999999999998, 10),
-(209, 'F', 0.525000000000000022, 0.41499999999999998, 0.170000000000000012, 0.832500000000000018, 0.275500000000000023, 0.168500000000000011, 0.309999999999999998, 13),
-(1451, 'I', 0.455000000000000016, 0.33500000000000002, 0.135000000000000009, 0.501000000000000001, 0.274000000000000021, 0.0995000000000000051, 0.106499999999999997, 7),
-(1108, 'I', 0.510000000000000009, 0.380000000000000004, 0.115000000000000005, 0.515499999999999958, 0.214999999999999997, 0.113500000000000004, 0.166000000000000009, 8),
-(3675, 'F', 0.594999999999999973, 0.450000000000000011, 0.165000000000000008, 1.08099999999999996, 0.489999999999999991, 0.252500000000000002, 0.279000000000000026, 12),
-(2108, 'F', 0.675000000000000044, 0.550000000000000044, 0.179999999999999993, 1.68849999999999989, 0.562000000000000055, 0.370499999999999996, 0.599999999999999978, 15),
-(3312, 'F', 0.479999999999999982, 0.380000000000000004, 0.135000000000000009, 0.507000000000000006, 0.191500000000000004, 0.13650000000000001, 0.154999999999999999, 12),
-(882, 'M', 0.655000000000000027, 0.520000000000000018, 0.165000000000000008, 1.40949999999999998, 0.585999999999999965, 0.290999999999999981, 0.405000000000000027, 9),
-(3402, 'M', 0.479999999999999982, 0.395000000000000018, 0.149999999999999994, 0.681499999999999995, 0.214499999999999996, 0.140500000000000014, 0.2495, 18),
-(829, 'I', 0.409999999999999976, 0.325000000000000011, 0.100000000000000006, 0.394000000000000017, 0.20799999999999999, 0.0655000000000000027, 0.105999999999999997, 6),
-(1305, 'M', 0.535000000000000031, 0.434999999999999998, 0.149999999999999994, 0.716999999999999971, 0.347499999999999976, 0.14449999999999999, 0.194000000000000006, 9),
-(3613, 'M', 0.599999999999999978, 0.46000000000000002, 0.179999999999999993, 1.1399999999999999, 0.422999999999999987, 0.257500000000000007, 0.364999999999999991, 10),
-(1068, 'I', 0.340000000000000024, 0.265000000000000013, 0.0800000000000000017, 0.201500000000000012, 0.0899999999999999967, 0.0475000000000000006, 0.0550000000000000003, 5),
-(2446, 'M', 0.5, 0.380000000000000004, 0.135000000000000009, 0.583500000000000019, 0.22950000000000001, 0.126500000000000001, 0.179999999999999993, 12),
-(1393, 'M', 0.635000000000000009, 0.474999999999999978, 0.170000000000000012, 1.19350000000000001, 0.520499999999999963, 0.269500000000000017, 0.366499999999999992, 10),
-(359, 'M', 0.744999999999999996, 0.584999999999999964, 0.214999999999999997, 2.49900000000000011, 0.92649999999999999, 0.471999999999999975, 0.699999999999999956, 17),
-(549, 'F', 0.564999999999999947, 0.450000000000000011, 0.160000000000000003, 0.79500000000000004, 0.360499999999999987, 0.155499999999999999, 0.23000000000000001, 12),
-(1154, 'F', 0.599999999999999978, 0.474999999999999978, 0.160000000000000003, 1.02649999999999997, 0.484999999999999987, 0.2495, 0.256500000000000006, 9),
-(1790, 'F', 0.54500000000000004, 0.385000000000000009, 0.149999999999999994, 1.11850000000000005, 0.542499999999999982, 0.244499999999999995, 0.284499999999999975, 9),
-(3703, 'F', 0.665000000000000036, 0.540000000000000036, 0.195000000000000007, 1.76400000000000001, 0.850500000000000034, 0.361499999999999988, 0.469999999999999973, 11),
-(1962, 'F', 0.655000000000000027, 0.515000000000000013, 0.179999999999999993, 1.41199999999999992, 0.619500000000000051, 0.248499999999999999, 0.496999999999999997, 11),
-(1665, 'I', 0.604999999999999982, 0.469999999999999973, 0.14499999999999999, 0.802499999999999991, 0.379000000000000004, 0.226500000000000007, 0.220000000000000001, 9),
-(635, 'M', 0.359999999999999987, 0.294999999999999984, 0.100000000000000006, 0.210499999999999993, 0.0660000000000000031, 0.0524999999999999981, 0.0749999999999999972, 9),
-(3901, 'M', 0.445000000000000007, 0.344999999999999973, 0.140000000000000013, 0.475999999999999979, 0.205499999999999988, 0.101500000000000007, 0.108499999999999999, 15),
-(2734, 'I', 0.41499999999999998, 0.33500000000000002, 0.100000000000000006, 0.357999999999999985, 0.169000000000000011, 0.067000000000000004, 0.104999999999999996, 7),
-(3856, 'M', 0.409999999999999976, 0.33500000000000002, 0.115000000000000005, 0.440500000000000003, 0.190000000000000002, 0.0850000000000000061, 0.135000000000000009, 8),
-(827, 'I', 0.395000000000000018, 0.28999999999999998, 0.0950000000000000011, 0.303999999999999992, 0.127000000000000002, 0.0840000000000000052, 0.076999999999999999, 6),
-(3381, 'I', 0.190000000000000002, 0.130000000000000004, 0.0449999999999999983, 0.0264999999999999993, 0.00899999999999999932, 0.0050000000000000001, 0.00899999999999999932, 5),
-(3972, 'I', 0.400000000000000022, 0.294999999999999984, 0.0950000000000000011, 0.252000000000000002, 0.110500000000000001, 0.0575000000000000025, 0.0660000000000000031, 6),
-(1155, 'M', 0.599999999999999978, 0.455000000000000016, 0.170000000000000012, 1.1915, 0.695999999999999952, 0.239499999999999991, 0.239999999999999991, 8),
-(3467, 'M', 0.640000000000000013, 0.5, 0.170000000000000012, 1.4544999999999999, 0.642000000000000015, 0.357499999999999984, 0.353999999999999981, 9),
-(2433, 'F', 0.609999999999999987, 0.484999999999999987, 0.165000000000000008, 1.08699999999999997, 0.425499999999999989, 0.232000000000000012, 0.380000000000000004, 11),
-(552, 'I', 0.614999999999999991, 0.489999999999999991, 0.154999999999999999, 0.988500000000000045, 0.41449999999999998, 0.195000000000000007, 0.344999999999999973, 13),
-(1425, 'F', 0.729999999999999982, 0.57999999999999996, 0.190000000000000002, 1.73750000000000004, 0.678499999999999992, 0.434499999999999997, 0.520000000000000018, 11),
-(2402, 'F', 0.584999999999999964, 0.41499999999999998, 0.154999999999999999, 0.69850000000000001, 0.299999999999999989, 0.145999999999999991, 0.195000000000000007, 12),
-(1748, 'F', 0.699999999999999956, 0.535000000000000031, 0.174999999999999989, 1.77299999999999991, 0.680499999999999994, 0.479999999999999982, 0.512000000000000011, 15),
-(3983, 'I', 0.57999999999999996, 0.434999999999999998, 0.149999999999999994, 0.891499999999999959, 0.362999999999999989, 0.192500000000000004, 0.251500000000000001, 6),
-(335, 'F', 0.739999999999999991, 0.599999999999999978, 0.195000000000000007, 1.97399999999999998, 0.597999999999999976, 0.408499999999999974, 0.709999999999999964, 16),
-(1587, 'I', 0.515000000000000013, 0.349999999999999978, 0.104999999999999996, 0.474499999999999977, 0.212999999999999995, 0.122999999999999998, 0.127500000000000002, 10),
-(2448, 'I', 0.275000000000000022, 0.204999999999999988, 0.0800000000000000017, 0.096000000000000002, 0.0359999999999999973, 0.0184999999999999991, 0.0299999999999999989, 6),
-(1362, 'F', 0.604999999999999982, 0.474999999999999978, 0.174999999999999989, 1.07600000000000007, 0.463000000000000023, 0.219500000000000001, 0.33500000000000002, 9),
-(2799, 'M', 0.640000000000000013, 0.484999999999999987, 0.149999999999999994, 1.09800000000000009, 0.519499999999999962, 0.222000000000000003, 0.317500000000000004, 10),
-(1413, 'F', 0.67000000000000004, 0.505000000000000004, 0.174999999999999989, 1.01449999999999996, 0.4375, 0.271000000000000019, 0.3745, 10),
-(1739, 'F', 0.67000000000000004, 0.540000000000000036, 0.195000000000000007, 1.61899999999999999, 0.739999999999999991, 0.330500000000000016, 0.465000000000000024, 11),
-(1152, 'M', 0.584999999999999964, 0.465000000000000024, 0.160000000000000003, 0.955500000000000016, 0.45950000000000002, 0.235999999999999988, 0.265000000000000013, 7),
-(2427, 'I', 0.564999999999999947, 0.434999999999999998, 0.154999999999999999, 0.782000000000000028, 0.271500000000000019, 0.16800000000000001, 0.284999999999999976, 14),
-(1777, 'M', 0.484999999999999987, 0.369999999999999996, 0.154999999999999999, 0.967999999999999972, 0.418999999999999984, 0.245499999999999996, 0.236499999999999988, 9),
-(3294, 'M', 0.574999999999999956, 0.455000000000000016, 0.184999999999999998, 1.15599999999999992, 0.552499999999999991, 0.242999999999999994, 0.294999999999999984, 13),
-(1403, 'M', 0.650000000000000022, 0.510000000000000009, 0.190000000000000002, 1.54200000000000004, 0.715500000000000025, 0.373499999999999999, 0.375, 9),
-(2256, 'M', 0.510000000000000009, 0.395000000000000018, 0.14499999999999999, 0.61850000000000005, 0.215999999999999998, 0.138500000000000012, 0.239999999999999991, 12),
-(3984, 'F', 0.584999999999999964, 0.450000000000000011, 0.125, 0.873999999999999999, 0.354499999999999982, 0.20749999999999999, 0.225000000000000006, 6),
-(1116, 'M', 0.525000000000000022, 0.405000000000000027, 0.119999999999999996, 0.755499999999999949, 0.3755, 0.155499999999999999, 0.201000000000000012, 9),
-(1366, 'M', 0.609999999999999987, 0.474999999999999978, 0.170000000000000012, 1.02649999999999997, 0.434999999999999998, 0.233500000000000013, 0.303499999999999992, 10),
-(3759, 'I', 0.525000000000000022, 0.400000000000000022, 0.140000000000000013, 0.605500000000000038, 0.260500000000000009, 0.107999999999999999, 0.209999999999999992, 9);
-
-
-DROP TABLE IF EXISTS warpbreaks CASCADE;
-CREATE TABLE warpbreaks(
-    id      serial,
-    breaks  integer,
-    wool    char(1),
-    tension char(1),
-    g       char(1)
-);
-
-INSERT INTO warpbreaks(breaks, wool, tension, g) VALUES
-(26, 'A', 'L', '1'),
-(30, 'A', 'L', '1'),
-(54, 'A', 'L', '1'),
-(25, 'A', 'L', '1'),
-(70, 'A', 'L', '1'),
-(52, 'A', 'L', '1'),
-(51, 'A', 'L', '1'),
-(26, 'A', 'L', '1'),
-(67, 'A', 'L', '1'),
-(18, 'A', 'M', '1'),
-(21, 'A', 'M', '1'),
-(29, 'A', 'M', '1'),
-(17, 'A', 'M', '1'),
-(12, 'A', 'M', '1'),
-(18, 'A', 'M', '1'),
-(35, 'A', 'M', '1'),
-(30, 'A', 'M', '1'),
-(36, 'A', 'M', '1'),
-(36, 'A', 'H', '0'),
-(21, 'A', 'H', '0'),
-(24, 'A', 'H', '0'),
-(18, 'A', 'H', '0'),
-(10, 'A', 'H', '0'),
-(43, 'A', 'H', '0'),
-(28, 'A', 'H', '0'),
-(15, 'A', 'H', '0'),
-(26, 'A', 'H', '0'),
-(27, 'B', 'L', '0'),
-(14, 'B', 'L', '0'),
-(29, 'B', 'L', '0'),
-(19, 'B', 'L', '0'),
-(29, 'B', 'L', '0'),
-(31, 'B', 'L', '0'),
-(41, 'B', 'L', '0'),
-(20, 'B', 'L', '1'),
-(44, 'B', 'L', '1'),
-(42, 'B', 'M', '1'),
-(26, 'B', 'M', '1'),
-(19, 'B', 'M', '1'),
-(16, 'B', 'M', '1'),
-(39, 'B', 'M', '1'),
-(28, 'B', 'M', '1'),
-(21, 'B', 'M', '1'),
-(39, 'B', 'M', '1'),
-(29, 'B', 'M', '1'),
-(20, 'B', 'H', '1'),
-(21, 'B', 'H', '1'),
-(24, 'B', 'H', '1'),
-(17, 'B', 'H', '1'),
-(13, 'B', 'H', '1'),
-(15, 'B', 'H', '1'),
-(15, 'B', 'H', '1'),
-(16, 'B', 'H', '1'),
-(28, 'B', 'H', '1');
-
-DROP TABLE IF EXISTS warpbreaks_dummy;
-SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tension');
-
-DROP TABLE IF EXISTS "Patients";
-CREATE TABLE "Patients" (
-    id integer NOT NULL,
-    "Second_attack" integer,
-    "Treatment" integer,
-    trait_anxiety integer,
-    g1 char(1),
-    g2 char(1)
-);
-
-INSERT INTO "Patients"(ID, "Second_attack", "Treatment", trait_anxiety, g1, g2) VALUES
-( 1, 1, 1, 70, 'A', 'x'),
-( 2, 1, 1, 80, 'A', 'y'),
-( 3, 1, 1, 50, 'A', 'y'),
-( 4, 1, 0, 60, 'A', 'y'),
-( 5, 1, 0, 40, 'A', 'y'),
-( 6, 1, 0, 65, 'A', 'x'),
-( 7, 1, 0, 75, 'A', 'x'),
-( 8, 1, 0, 80, 'B', 'x'),
-( 9, 1, 0, 70, 'A', 'x'),
-(10, 1, 0, 60, 'A', 'z'),
-(11, 0, 1, 65, 'A', 'z'),
-(12, 0, 1, 50, 'B', 'y'),
-(13, 0, 1, 45, 'A', 'y'),
-(14, 0, 1, 35, 'A', 'z'),
-(15, 0, 1, 40, 'A', 'z'),
-(16, 0, 1, 50, 'A', 'z'),
-(17, 0, 0, 55, 'B', 'z'),
-(18, 0, 0, 45, 'B', 'z'),
-(19, 0, 0, 50, 'B', 'z'),
-(20, 0, 0, 60, 'B', 'z');
 /* ----------------------------------------------------------------------- *//**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -333,7 +106,6 @@ CREATE OR REPLACE FUNCTION test_pmml_output(test_table TEXT, madlib_train_table
     SELECT test_pmml_output(test_table, madlib_train_table, madlib_predict_table, id_col, madlib_prediction_metric, pypmml_prediction_metric, '', '')
 $$ language sql;
 
-
 CREATE OR REPLACE FUNCTION count_expected_names(madlib_train_table varchar, formula varchar, expected varchar[])
 RETURNS bigint AS $$
     SELECT count(distinct result.name)
diff --git a/src/ports/postgres/modules/pmml/test/pmml_check_fields.sql_in b/src/ports/postgres/modules/pmml/test/pmml_check_fields.sql_in
index 8381398a..8d800841 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_check_fields.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_check_fields.sql_in
@@ -19,9 +19,12 @@
  *
  *//* ----------------------------------------------------------------------- */
 
- \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+ \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_binomial.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_binomial.sql_in
index 3c17815c..78cd8d08 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_binomial.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_binomial.sql_in
@@ -2,6 +2,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+ \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_gamma.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_gamma.sql_in
index e10a827d..2a00d630 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_gamma.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_gamma.sql_in
@@ -2,6 +2,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_ig.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_ig.sql_in
index a99b3056..c031a856 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_ig.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_ig.sql_in
@@ -2,6 +2,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_normal.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_normal.sql_in
index 7616c644..84e84433 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_normal.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_normal.sql_in
@@ -2,6 +2,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_poisson.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_poisson.sql_in
index 2dcae6d6..85b75492 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_poisson.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_poisson.sql_in
@@ -2,6 +2,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_with_grouping.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_with_grouping.sql_in
index b13f6e77..c16f54c7 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_with_grouping.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_with_grouping.sql_in
@@ -23,6 +23,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 --TODO: Should all the predict and model tables have the same name ?
diff --git a/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in b/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
index 8b96460f..79d9f0f6 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_glm_with_name_spec.sql_in
@@ -23,6 +23,9 @@
              `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
index dc0e28bd..1eb445f7 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_intercept_as_predictor.sql_in
@@ -22,6 +22,9 @@
  \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_linear.sql_in b/src/ports/postgres/modules/pmml/test/pmml_linear.sql_in
index a870e373..304ccba5 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_linear.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_linear.sql_in
@@ -22,6 +22,9 @@
  \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_logistic.sql_in b/src/ports/postgres/modules/pmml/test/pmml_logistic.sql_in
index 8e9f2cbd..2efd8129 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_logistic.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_logistic.sql_in
@@ -22,6 +22,9 @@
  \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')
 
diff --git a/src/ports/postgres/modules/pmml/test/pmml_with_non_array_expression.sql_in b/src/ports/postgres/modules/pmml/test/pmml_with_non_array_expression.sql_in
index 1a7769d7..3dcd8014 100644
--- a/src/ports/postgres/modules/pmml/test/pmml_with_non_array_expression.sql_in
+++ b/src/ports/postgres/modules/pmml/test/pmml_with_non_array_expression.sql_in
@@ -22,6 +22,9 @@
  \i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
               `\1/../modules/pmml/test/pmml.setup.sql_in'
 )
+\i m4_regexp(MADLIB_LIBRARY_PATH, `\(.*\)/lib',
+              `\1/../modules/pmml/test/pmml.setup.datasets.sql_in'
+)
 
 m4_changequote(`<!'', `!>'')