You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2018/06/01 01:48:31 UTC

[3/3] madlib git commit: Logregr: Report error if output table is empty

Logregr: Report error if output table is empty

JIRA MADLIB-1172

When the model cannot be generated due to ill-conditioned input data,
the output table doesn't get populated.  In this case, we report back an
error instead of creating the empty table.

Closes #270


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/abef95ec
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/abef95ec
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/abef95ec

Branch: refs/heads/master
Commit: abef95ec99d2797fa7a51c8d4548d88a656d7364
Parents: ef52d87
Author: Himanshu Pandey <hp...@pivotal.io>
Authored: Thu May 31 18:44:41 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu May 31 18:47:32 2018 -0700

----------------------------------------------------------------------
 .../postgres/modules/regress/logistic.py_in     | 157 +++++++++----------
 .../modules/regress/test/logistic.sql_in        |  69 ++------
 2 files changed, 92 insertions(+), 134 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/logistic.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/logistic.py_in b/src/ports/postgres/modules/regress/logistic.py_in
index 76cbb6a..77ea465 100644
--- a/src/ports/postgres/modules/regress/logistic.py_in
+++ b/src/ports/postgres/modules/regress/logistic.py_in
@@ -153,7 +153,8 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
         plpy.error("Logregr error: Invalid output table name!")
 
     if (table_exists(tbl_output, only_first_schema=True)):
-        plpy.error("Output table name already exists. Drop the table before calling the function.")
+        plpy.error("Output table name already exists. Drop the table before "
+                   "calling the function.")
 
     if not dep_col or dep_col.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid dependent column name!")
@@ -164,7 +165,6 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
     if not ind_col or ind_col.lower() in ('null', ''):
         plpy.error("Logregr error: Invalid independent column name!")
 
-
     if grouping_col is not None:
         if grouping_col == '':
             plpy.error("Logregr error: Invalid grouping columns name!")
@@ -173,14 +173,14 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
             plpy.error("Logregr error: Grouping column does not exist!")
 
         intersect = frozenset(_string_to_array(grouping_col)).intersection(
-                frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
-                           'p_values', 'odds_ratios', 'condition_no',
-                           'num_processed', 'num_missing_rows_skipped',
-                           'variance_covariance')))
+            frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
+                       'p_values', 'odds_ratios', 'condition_no',
+                       'num_processed', 'num_missing_rows_skipped',
+                       'variance_covariance')))
         if len(intersect) > 0:
             plpy.error("Logregr error: Conflicted grouping column name.\n"
                        "Predefined name(s) {0} are not allow!".format(
-                            ', '.join(intersect)))
+                           ', '.join(intersect)))
 
     if max_iter <= 0:
         plpy.error("Logregr error: Maximum number of iterations must be positive!")
@@ -231,12 +231,12 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
             'cg': "__logregr_cg_result",
             'igd': "__logregr_igd_result"}
 
-    plpy.execute("select {schema_madlib}.create_schema_pg_temp()".format(**args))
-    plpy.execute(
-        """
-        drop table if exists pg_temp.{tbl_logregr_args};
-        create table pg_temp.{tbl_logregr_args} as
-            select
+    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".
+                 format(**args))
+    plpy.execute("""
+        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+        CREATE TABLE pg_temp.{tbl_logregr_args} as
+            SELECT
                 {max_iter} as max_iter,
                 {tolerance} as tolerance
         """.format(**args))
@@ -257,7 +257,8 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                                       dep_col, ind_col, optimizer,
                                       grouping_col=grouping_col,
                                       grouping_str=grouping_str,
-                                      col_grp_iteration=args["col_grp_iteration"],
+                                      col_grp_iteration=args[
+                                          "col_grp_iteration"],
                                       col_grp_state=args["col_grp_state"])
 
     grouping_str1 = "" if grouping_col is None else grouping_col + ","
@@ -265,65 +266,61 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
     using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
     join_str = "," if grouping_str1 == "" else "join "
 
-    plpy.execute(
-        """
-        drop table if exists {tbl_output};
-        create table {tbl_output} as
-            select
+    plpy.execute("""
+        DROP TABLE IF EXISTS {tbl_output};
+        CREATE TABLE {tbl_output} as
+            SELECT
                 {grouping_str1}
-                (case when (result).status = 1 then (result).coef
-                    else NULL::double precision[] end) as coef,
-                (case when (result).status = 1 then (result).log_likelihood
-                    else NULL::double precision end) as log_likelihood,
-                (case when (result).status = 1 then (result).std_err
-                    else NULL::double precision[] end) as std_err,
-                (case when (result).status = 1 then (result).z_stats
-                    else NULL::double precision[] end) as z_stats,
-                (case when (result).status = 1 then (result).p_values
-                    else NULL::double precision[] end) as p_values,
-                (case when (result).status = 1 then (result).odds_ratios
-                    else NULL::double precision[] end) as odds_ratios,
-                (case when (result).status = 1 then (result).condition_no
-                    else NULL::double precision end) as condition_no,
-                (case when (result).status = 1 then (result).num_processed
-                    when result is NULL then 0
-                    else NULL::bigint end) as num_rows_processed,
-                (case when (result).status = 1 then num_rows - (result).num_processed
-                    when result is null then num_rows
-                    else NULL::bigint end) as num_missing_rows_skipped,
+                (CASE WHEN (result).status = 1 THEN (result).coef
+                    ELSE NULL::double precision[] END) as coef,
+                (CASE WHEN (result).status = 1 THEN (result).log_likelihood
+                    ELSE NULL::double precision END) as log_likelihood,
+                (CASE WHEN (result).status = 1 THEN (result).std_err
+                    ELSE NULL::double precision[] END) as std_err,
+                (CASE WHEN (result).status = 1 THEN (result).z_stats
+                    ELSE NULL::double precision[] END) as z_stats,
+                (CASE WHEN (result).status = 1 THEN (result).p_values
+                    ELSE NULL::double precision[] END) as p_values,
+                (CASE WHEN (result).status = 1 THEN (result).odds_ratios
+                    ELSE NULL::double precision[] END) as odds_ratios,
+                (CASE WHEN (result).status = 1 THEN (result).condition_no
+                    ELSE NULL::double precision END) as condition_no,
+                (CASE WHEN (result).status = 1 THEN (result).num_processed
+                    when result is NULL THEN 0
+                    ELSE NULL::bigint end) AS num_rows_processed,
+                (CASE WHEN (result).status = 1 THEN num_rows - (result).num_processed
+                    when result is null THEN num_rows
+                    ELSE NULL::bigint end) AS num_missing_rows_skipped,
                 {col_grp_iteration} as num_iterations,
-                (case when (result).status = 1 then (result).vcov
-                    else NULL::double precision[] end) as variance_covariance
-            from
+                (CASE WHEN (result).status = 1 THEN (result).vcov
+                    ELSE NULL::double precision[] END) as variance_covariance
+            FROM
             (
-                select
-                    {col_grp_iteration}, {grouping_str1} result, num_rows
-                from
-                (
-                    (select
+                SELECT {col_grp_iteration}, {grouping_str1} result, num_rows
+                FROM (
+                    (SELECT
                         {grouping_str1}
                         {schema_madlib}.{fnName}({col_grp_state}) as result,
                         {col_grp_iteration}
-                    from
+                    FROM
                         {tbl_logregr_state}
                     ) t
-                    join
-                    (
-                    select
+                    JOIN
+                    (SELECT
                         {grouping_str1}
                         max({col_grp_iteration}) as {col_grp_iteration}
-                    from {tbl_logregr_state}
-                    group by {grouping_str2}
+                    FROM {tbl_logregr_state}
+                    GROUP BY {grouping_str2}
                     ) s
-                    using ({grouping_str1} {col_grp_iteration})
+                    USING ({grouping_str1} {col_grp_iteration})
                 ) q1
                 {join_str}
                 (
-                    select
+                    SELECT
                         {grouping_str1}
                         count(*) num_rows
-                    from {tbl_source}
-                    group by {grouping_str2}
+                    FROM {tbl_source}
+                    GROUP BY {grouping_str2}
                 ) q2
                 {using_str}
             ) q3
@@ -336,21 +333,28 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                    **args))
 
     failed_groups = plpy.execute("""
-                        select count(*) as count
-                        from {tbl_output}
-                        where coef is Null
-                        """.format(**args))[0]["count"]
+        SELECT count(*) as count
+        FROM {tbl_output}
+        WHERE coef IS NULL
+        """.format(tbl_output=tbl_output))[0]["count"]
     all_groups = plpy.execute("""
-                    select count(*) as count
-                    from {tbl_output}
-                    """.format(**args))[0]["count"]
-
-    num_rows = plpy.execute(
-        """
-        select
-            sum(num_rows_processed) as num_rows_processed,
-            sum(num_missing_rows_skipped) as num_missing_rows_skipped
-        from {tbl_output}
+        SELECT count(*) AS count
+        FROM {tbl_output}
+        """.format(**args))[0]["count"]
+
+    # JIRA: MADLIB-1172: When no model can be generated due to ill-conditioned
+    # input data, the output table dosen't get populated. In That
+    # case, report back an error instead of a successful empty table.
+    if failed_groups == all_groups or all_groups == 0:
+        plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
+        plpy.error("Logregr error: No model created possibly "
+                   "due to ill-conditioned data.")
+
+    num_rows = plpy.execute("""
+        SELECT
+            sum(num_rows_processed) AS num_rows_processed,
+            sum(num_missing_rows_skipped) AS num_missing_rows_skipped
+        FROM {tbl_output}
         """.format(tbl_output=tbl_output))[0]
 
     if num_rows['num_rows_processed'] is None:
@@ -380,14 +384,9 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                    grouping_col="'" + grouping_col + "'" if grouping_col else "NULL",
                    **args))
 
-    # if grouping_col:
-    #     plpy.info(str(all_groups - failed_groups) +
-    #                 " groups succesfully passed, and " +
-    #                 str(failed_groups) + " groups failed")
-
     plpy.execute("""
-                 drop table if exists pg_temp.{tbl_logregr_args};
-                 drop table if exists pg_temp.{tbl_logregr_state}
+                 DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+                 DROP TABLE IF EXISTS pg_temp.{tbl_logregr_state}
                  """.format(**args))
 
     plpy.execute("set client_min_messages to " + old_msg_level)
@@ -517,7 +516,7 @@ SELECT * from patients_logregr;
         help_string = "No such option. Use {schema_madlib}.logregr_train('help')"
 
     return help_string.format(schema_madlib=schema_madlib)
-## ========================================================================
+# ========================================================================
 
 
 def logregr_predict_help(schema_madlib, message, **kwargs):

http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/test/logistic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/test/logistic.sql_in b/src/ports/postgres/modules/regress/test/logistic.sql_in
index b029404..697dce4 100644
--- a/src/ports/postgres/modules/regress/test/logistic.sql_in
+++ b/src/ports/postgres/modules/regress/test/logistic.sql_in
@@ -720,22 +720,8 @@ select logregr_train(
     0
 );
 
--- Even though we were far more generous for the conjugate-gradient optimizer,
--- but it still generates failures intermittently. We comment it out here
--- to acknowledge CG may generate un-converged results.
--- SELECT
---     assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
---     assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
---     assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
---     assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
---     assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
---     assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
--- FROM temp_result;
-
--- IGD essentially does not work for this case, so we are not testing it
-
--- insert some NULL values for null handling testing
-insert into grad_school (admit, gre, gpa, rank) values
+--- insert some NULL values for null handling testing
+INSERT INTO grad_school (admit, gre, gpa, rank) VALUES
 (NULL, NULL, 3, 4),
 (1, NULL, 3, 5);
 
@@ -748,44 +734,17 @@ select logregr_train(
     'ARRAY[1, gre, gpa]',
     'rank');
 
-DROP TABLE IF EXISTS all_null_patients;
-CREATE TABLE all_null_patients(
-    id INTEGER NOT NULL,
-    second_attack INTEGER,
-    treatment INTEGER,
-    trait_anxiety INTEGER
-);
 
-INSERT INTO all_null_patients(id, second_attack, treatment, trait_anxiety) VALUES
-(0, NULL, 1, 70),
-(0, NULL, 1, 80),
-(0, NULL, 1, 50),
-(0, NULL, 0, 60),
-(0, NULL, 0, 40),
-(0, NULL, 0, 65),
-(0, NULL, 0, 75),
-(0, NULL, 0, 80),
-(0, NULL, 0, 70),
-(0, NULL, 0, 60),
-(1, NULL, 1, 65),
-(1, NULL, 1, 50),
-(1, NULL, 1, 45),
-(1, NULL, 1, 35),
-(1, NULL, 1, 40),
-(1, NULL, 1, 50),
-(1, NULL, 0, 55),
-(1, NULL, 0, 45),
-(1, NULL, 0, 50),
-(1, NULL, 0, 60);
+-- Even though we were far more generous for the conjugate-gradient optimizer,
+-- but it still generates failures intermittently. We comment it out here
+-- to acknowledge CG may generate un-converged results.
+-- SELECT
+--     assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
+--     assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
+--     assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
+--     assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
+--     assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
+--     assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
+-- FROM temp_result;
 
-drop table if exists temp_result;
-drop table if exists temp_result_summary;
-select logregr_train(
-    'all_null_patients',
-    'temp_result',
-    'second_attack',
-    'ARRAY[1, treatment, trait_anxiety]',
-    'id',
-    20,
-    'cg'
-);
+-- IGD essentially does not work for this case, so we are not testing it