You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2018/06/01 01:48:31 UTC
[3/3] madlib git commit: Logregr: Report error if output table is
empty
Logregr: Report error if output table is empty
JIRA MADLIB-1172
When the model cannot be generated due to ill-conditioned input data,
the output table doesn't get populated. In this case, we report back an
error instead of creating the empty table.
Closes #270
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/abef95ec
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/abef95ec
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/abef95ec
Branch: refs/heads/master
Commit: abef95ec99d2797fa7a51c8d4548d88a656d7364
Parents: ef52d87
Author: Himanshu Pandey <hp...@pivotal.io>
Authored: Thu May 31 18:44:41 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu May 31 18:47:32 2018 -0700
----------------------------------------------------------------------
.../postgres/modules/regress/logistic.py_in | 157 +++++++++----------
.../modules/regress/test/logistic.sql_in | 69 ++------
2 files changed, 92 insertions(+), 134 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/logistic.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/logistic.py_in b/src/ports/postgres/modules/regress/logistic.py_in
index 76cbb6a..77ea465 100644
--- a/src/ports/postgres/modules/regress/logistic.py_in
+++ b/src/ports/postgres/modules/regress/logistic.py_in
@@ -153,7 +153,8 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
plpy.error("Logregr error: Invalid output table name!")
if (table_exists(tbl_output, only_first_schema=True)):
- plpy.error("Output table name already exists. Drop the table before calling the function.")
+ plpy.error("Output table name already exists. Drop the table before "
+ "calling the function.")
if not dep_col or dep_col.strip().lower() in ('null', ''):
plpy.error("Logregr error: Invalid dependent column name!")
@@ -164,7 +165,6 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
if not ind_col or ind_col.lower() in ('null', ''):
plpy.error("Logregr error: Invalid independent column name!")
-
if grouping_col is not None:
if grouping_col == '':
plpy.error("Logregr error: Invalid grouping columns name!")
@@ -173,14 +173,14 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
plpy.error("Logregr error: Grouping column does not exist!")
intersect = frozenset(_string_to_array(grouping_col)).intersection(
- frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
- 'p_values', 'odds_ratios', 'condition_no',
- 'num_processed', 'num_missing_rows_skipped',
- 'variance_covariance')))
+ frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
+ 'p_values', 'odds_ratios', 'condition_no',
+ 'num_processed', 'num_missing_rows_skipped',
+ 'variance_covariance')))
if len(intersect) > 0:
plpy.error("Logregr error: Conflicted grouping column name.\n"
"Predefined name(s) {0} are not allow!".format(
- ', '.join(intersect)))
+ ', '.join(intersect)))
if max_iter <= 0:
plpy.error("Logregr error: Maximum number of iterations must be positive!")
@@ -231,12 +231,12 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
'cg': "__logregr_cg_result",
'igd': "__logregr_igd_result"}
- plpy.execute("select {schema_madlib}.create_schema_pg_temp()".format(**args))
- plpy.execute(
- """
- drop table if exists pg_temp.{tbl_logregr_args};
- create table pg_temp.{tbl_logregr_args} as
- select
+ plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".
+ format(**args))
+ plpy.execute("""
+ DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+ CREATE TABLE pg_temp.{tbl_logregr_args} as
+ SELECT
{max_iter} as max_iter,
{tolerance} as tolerance
""".format(**args))
@@ -257,7 +257,8 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
dep_col, ind_col, optimizer,
grouping_col=grouping_col,
grouping_str=grouping_str,
- col_grp_iteration=args["col_grp_iteration"],
+ col_grp_iteration=args[
+ "col_grp_iteration"],
col_grp_state=args["col_grp_state"])
grouping_str1 = "" if grouping_col is None else grouping_col + ","
@@ -265,65 +266,61 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
join_str = "," if grouping_str1 == "" else "join "
- plpy.execute(
- """
- drop table if exists {tbl_output};
- create table {tbl_output} as
- select
+ plpy.execute("""
+ DROP TABLE IF EXISTS {tbl_output};
+ CREATE TABLE {tbl_output} as
+ SELECT
{grouping_str1}
- (case when (result).status = 1 then (result).coef
- else NULL::double precision[] end) as coef,
- (case when (result).status = 1 then (result).log_likelihood
- else NULL::double precision end) as log_likelihood,
- (case when (result).status = 1 then (result).std_err
- else NULL::double precision[] end) as std_err,
- (case when (result).status = 1 then (result).z_stats
- else NULL::double precision[] end) as z_stats,
- (case when (result).status = 1 then (result).p_values
- else NULL::double precision[] end) as p_values,
- (case when (result).status = 1 then (result).odds_ratios
- else NULL::double precision[] end) as odds_ratios,
- (case when (result).status = 1 then (result).condition_no
- else NULL::double precision end) as condition_no,
- (case when (result).status = 1 then (result).num_processed
- when result is NULL then 0
- else NULL::bigint end) as num_rows_processed,
- (case when (result).status = 1 then num_rows - (result).num_processed
- when result is null then num_rows
- else NULL::bigint end) as num_missing_rows_skipped,
+ (CASE WHEN (result).status = 1 THEN (result).coef
+ ELSE NULL::double precision[] END) as coef,
+ (CASE WHEN (result).status = 1 THEN (result).log_likelihood
+ ELSE NULL::double precision END) as log_likelihood,
+ (CASE WHEN (result).status = 1 THEN (result).std_err
+ ELSE NULL::double precision[] END) as std_err,
+ (CASE WHEN (result).status = 1 THEN (result).z_stats
+ ELSE NULL::double precision[] END) as z_stats,
+ (CASE WHEN (result).status = 1 THEN (result).p_values
+ ELSE NULL::double precision[] END) as p_values,
+ (CASE WHEN (result).status = 1 THEN (result).odds_ratios
+ ELSE NULL::double precision[] END) as odds_ratios,
+ (CASE WHEN (result).status = 1 THEN (result).condition_no
+ ELSE NULL::double precision END) as condition_no,
+ (CASE WHEN (result).status = 1 THEN (result).num_processed
+ when result is NULL THEN 0
+ ELSE NULL::bigint end) AS num_rows_processed,
+ (CASE WHEN (result).status = 1 THEN num_rows - (result).num_processed
+ when result is null THEN num_rows
+ ELSE NULL::bigint end) AS num_missing_rows_skipped,
{col_grp_iteration} as num_iterations,
- (case when (result).status = 1 then (result).vcov
- else NULL::double precision[] end) as variance_covariance
- from
+ (CASE WHEN (result).status = 1 THEN (result).vcov
+ ELSE NULL::double precision[] END) as variance_covariance
+ FROM
(
- select
- {col_grp_iteration}, {grouping_str1} result, num_rows
- from
- (
- (select
+ SELECT {col_grp_iteration}, {grouping_str1} result, num_rows
+ FROM (
+ (SELECT
{grouping_str1}
{schema_madlib}.{fnName}({col_grp_state}) as result,
{col_grp_iteration}
- from
+ FROM
{tbl_logregr_state}
) t
- join
- (
- select
+ JOIN
+ (SELECT
{grouping_str1}
max({col_grp_iteration}) as {col_grp_iteration}
- from {tbl_logregr_state}
- group by {grouping_str2}
+ FROM {tbl_logregr_state}
+ GROUP BY {grouping_str2}
) s
- using ({grouping_str1} {col_grp_iteration})
+ USING ({grouping_str1} {col_grp_iteration})
) q1
{join_str}
(
- select
+ SELECT
{grouping_str1}
count(*) num_rows
- from {tbl_source}
- group by {grouping_str2}
+ FROM {tbl_source}
+ GROUP BY {grouping_str2}
) q2
{using_str}
) q3
@@ -336,21 +333,28 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
**args))
failed_groups = plpy.execute("""
- select count(*) as count
- from {tbl_output}
- where coef is Null
- """.format(**args))[0]["count"]
+ SELECT count(*) as count
+ FROM {tbl_output}
+ WHERE coef IS NULL
+ """.format(tbl_output=tbl_output))[0]["count"]
all_groups = plpy.execute("""
- select count(*) as count
- from {tbl_output}
- """.format(**args))[0]["count"]
-
- num_rows = plpy.execute(
- """
- select
- sum(num_rows_processed) as num_rows_processed,
- sum(num_missing_rows_skipped) as num_missing_rows_skipped
- from {tbl_output}
+ SELECT count(*) AS count
+ FROM {tbl_output}
+ """.format(**args))[0]["count"]
+
+    # JIRA: MADLIB-1172: When no model can be generated due to ill-conditioned
+    # input data, the output table doesn't get populated. In that
+    # case, report back an error instead of a successful empty table.
+ if failed_groups == all_groups or all_groups == 0:
+ plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
+ plpy.error("Logregr error: No model created possibly "
+ "due to ill-conditioned data.")
+
+ num_rows = plpy.execute("""
+ SELECT
+ sum(num_rows_processed) AS num_rows_processed,
+ sum(num_missing_rows_skipped) AS num_missing_rows_skipped
+ FROM {tbl_output}
""".format(tbl_output=tbl_output))[0]
if num_rows['num_rows_processed'] is None:
@@ -380,14 +384,9 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
grouping_col="'" + grouping_col + "'" if grouping_col else "NULL",
**args))
- # if grouping_col:
- # plpy.info(str(all_groups - failed_groups) +
- # " groups succesfully passed, and " +
- # str(failed_groups) + " groups failed")
-
plpy.execute("""
- drop table if exists pg_temp.{tbl_logregr_args};
- drop table if exists pg_temp.{tbl_logregr_state}
+ DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+ DROP TABLE IF EXISTS pg_temp.{tbl_logregr_state}
""".format(**args))
plpy.execute("set client_min_messages to " + old_msg_level)
@@ -517,7 +516,7 @@ SELECT * from patients_logregr;
help_string = "No such option. Use {schema_madlib}.logregr_train('help')"
return help_string.format(schema_madlib=schema_madlib)
-## ========================================================================
+# ========================================================================
def logregr_predict_help(schema_madlib, message, **kwargs):
http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/test/logistic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/test/logistic.sql_in b/src/ports/postgres/modules/regress/test/logistic.sql_in
index b029404..697dce4 100644
--- a/src/ports/postgres/modules/regress/test/logistic.sql_in
+++ b/src/ports/postgres/modules/regress/test/logistic.sql_in
@@ -720,22 +720,8 @@ select logregr_train(
0
);
--- Even though we were far more generous for the conjugate-gradient optimizer,
--- but it still generates failures intermittently. We comment it out here
--- to acknowledge CG may generate un-converged results.
--- SELECT
--- assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
--- assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
--- assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
--- assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
--- assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
--- assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
--- FROM temp_result;
-
--- IGD essentially does not work for this case, so we are not testing it
-
--- insert some NULL values for null handling testing
-insert into grad_school (admit, gre, gpa, rank) values
+--- insert some NULL values for null handling testing
+INSERT INTO grad_school (admit, gre, gpa, rank) VALUES
(NULL, NULL, 3, 4),
(1, NULL, 3, 5);
@@ -748,44 +734,17 @@ select logregr_train(
'ARRAY[1, gre, gpa]',
'rank');
-DROP TABLE IF EXISTS all_null_patients;
-CREATE TABLE all_null_patients(
- id INTEGER NOT NULL,
- second_attack INTEGER,
- treatment INTEGER,
- trait_anxiety INTEGER
-);
-INSERT INTO all_null_patients(id, second_attack, treatment, trait_anxiety) VALUES
-(0, NULL, 1, 70),
-(0, NULL, 1, 80),
-(0, NULL, 1, 50),
-(0, NULL, 0, 60),
-(0, NULL, 0, 40),
-(0, NULL, 0, 65),
-(0, NULL, 0, 75),
-(0, NULL, 0, 80),
-(0, NULL, 0, 70),
-(0, NULL, 0, 60),
-(1, NULL, 1, 65),
-(1, NULL, 1, 50),
-(1, NULL, 1, 45),
-(1, NULL, 1, 35),
-(1, NULL, 1, 40),
-(1, NULL, 1, 50),
-(1, NULL, 0, 55),
-(1, NULL, 0, 45),
-(1, NULL, 0, 50),
-(1, NULL, 0, 60);
+-- Even though we were far more generous for the conjugate-gradient optimizer,
+-- but it still generates failures intermittently. We comment it out here
+-- to acknowledge CG may generate un-converged results.
+-- SELECT
+-- assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
+-- assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
+-- assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
+-- assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
+-- assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
+-- assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
+-- FROM temp_result;
-drop table if exists temp_result;
-drop table if exists temp_result_summary;
-select logregr_train(
- 'all_null_patients',
- 'temp_result',
- 'second_attack',
- 'ARRAY[1, treatment, trait_anxiety]',
- 'id',
- 20,
- 'cg'
-);
+-- IGD essentially does not work for this case, so we are not testing it