Posted to commits@madlib.apache.org by ri...@apache.org on 2018/06/01 01:48:29 UTC

[1/3] madlib git commit: DT: Don't use NULL value to get dep_var type

Repository: madlib
Updated Branches:
  refs/heads/master ccc3a1832 -> abef95ec9


DT: Don't use NULL value to get dep_var type

JIRA: MADLIB-1233

Function `_is_dep_categorical` is used to obtain the type of the
dependent variable expression. It previously fetched an arbitrary value
using `LIMIT 1` and inspected the type of that value in Python. Because
NULL values were not filtered out, the `LIMIT 1` query could return a
NULL, which arrives in Python as `None` and leads to incorrect results.

This commit determines the type in the database instead of in Python and
filters out NULL values. It also checks that at least one non-NULL value
is available, and raises an appropriate error otherwise.
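
For illustration, the core of the new logic reduces to the following sketch
(condensed from the diff below; `get_expr_type`, `is_psql_numeric_type` and
`is_psql_boolean_type` are existing MADlib utility helpers):

    def _is_dep_categorical(training_table_name, dependent_variable):
        # Ask the database for the expression's type instead of sampling a
        # value with LIMIT 1 and inspecting it in Python (which could be None).
        dep_type = get_expr_type(dependent_variable, training_table_name)
        is_dep_numeric = is_psql_numeric_type(dep_type,
                                              exclude=['smallint',
                                                       'integer',
                                                       'bigint'])
        is_dep_bool = is_psql_boolean_type(dep_type)
        # Integer and boolean dependent variables are treated as categorical.
        return (not is_dep_numeric, is_dep_bool)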


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/26f61e91
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/26f61e91
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/26f61e91

Branch: refs/heads/master
Commit: 26f61e9110f12804c76ca707f52f1774d8844a7c
Parents: ccc3a18
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue May 1 14:24:34 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu May 31 17:03:30 2018 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  |  18 +-
 .../recursive_partitioning/decision_tree.sql_in | 206 +++++++++----------
 .../modules/utilities/validate_args.py_in       |  27 ++-
 3 files changed, 135 insertions(+), 116 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/26f61e91/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 6f64234..48b8fab 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -31,7 +31,7 @@ from utilities.utilities import _assert
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
 from utilities.utilities import add_postfix
-from utilities.utilities import is_psql_numeric_type
+from utilities.utilities import is_psql_numeric_type, is_psql_boolean_type
 from utilities.utilities import split_quoted_delimited_str
 from utilities.utilities import py_list_to_sql_string
 # ------------------------------------------------------------
@@ -56,6 +56,11 @@ def _tree_validate_args(
             "Decision tree error: Invalid data table.")
     _assert(table_exists(training_table_name),
             "Decision tree error: Data table is missing.")
+    _assert(not table_is_empty(training_table_name,
+                               _get_filter_str(dependent_variable, grouping_cols)),
+            "Decision tree error: Data table ({0}) is empty "
+            "(after filtering invalid tuples)".
+            format(training_table_name))
 
     _assert(not table_exists(output_table_name, only_first_schema=True),
             "Decision tree error: Output table already exists.")
@@ -567,10 +572,13 @@ def _is_dep_categorical(training_table_name, dependent_variable):
     @brief Sample the dependent variable to check whether it is
     a categorical variable.
     """
-    sample_dep = plpy.execute("SELECT " + dependent_variable +
-                              " AS dep FROM " +
-                              training_table_name + " LIMIT 1")[0]['dep']
-    return (not isinstance(sample_dep, float), isinstance(sample_dep, bool))
+    sample_dep = get_expr_type(dependent_variable, training_table_name)
+    is_dep_numeric = is_psql_numeric_type(sample_dep,
+                                          exclude=['smallint',
+                                                   'integer',
+                                                   'bigint'])
+    is_dep_bool = is_psql_boolean_type(sample_dep)
+    return (not is_dep_numeric, is_dep_bool)
 # ------------------------------------------------------------
 
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/26f61e91/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index a3c4963..8e69d9b 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -25,7 +25,7 @@ m4_include(`SQLCommon.m4')
 </ul></div>
 
 @brief
-Decision trees are tree-based supervised learning methods 
+Decision trees are tree-based supervised learning methods
 that can be used for classification and regression.
 
 A decision tree is a supervised learning method that can be used for
@@ -37,7 +37,7 @@ or regression rules.
 
 @anchor train
 @par Training Function
-We implement the decision tree using the CART algorithm 
+We implement the decision tree using the CART algorithm
 introduced by Breiman et al. [1].
 The training function has the following syntax:
 <pre class="syntax">
@@ -68,7 +68,7 @@ tree_train(
   <dt>output_table_name</dt>
   <dd>TEXT. Name of the generated table containing the model.
   If a table with the same name already exists, an
-  error will be returned. A summary table 
+  error will be returned. A summary table
   named <em>\<output_table_name\>_summary</em> is also
   created.  A cross-validation table <em>\<output_table_name\>_cv</em>
   may also be created.  These are described later on this page.
@@ -99,7 +99,7 @@ tree_train(
 
   Array columns can also be included in the list, where the array is expanded
   to treat each element of the array as a feature.
-  
+
   Note that not every combination of the levels of a
   categorical variable is checked when evaluating a split. The levels of the
   non-integer categorical variable are ordered by the entropy of the variable in
@@ -121,9 +121,9 @@ tree_train(
   <DD>TEXT, default = 'gini' for classification, 'mse' for regression.
   Impurity function to compute the feature to use to split a node.
   Supported criteria are 'gini', 'entropy', 'misclassification' for
-  classification trees. For regression trees, split_criterion 
-  of 'mse' (mean-squared error) is always used, irrespective of 
-  the input for this argument. 
+  classification trees. For regression trees, split_criterion
+  of 'mse' (mean-squared error) is always used, irrespective of
+  the input for this argument.
   Refer to reference [1] for more information on impurity measures.</DD>
 
   <DT>grouping_cols (optional)</DT>
@@ -132,15 +132,15 @@ tree_train(
       each group. </DD>
 
   <DT>weights (optional)</DT>
-  <DD>TEXT. Column name containing numerical weights for 
-  each observation.  Can be any value greater 
-  than 0 (does not need to be an integer).  
+  <DD>TEXT. Column name containing numerical weights for
+  each observation.  Can be any value greater
+  than 0 (does not need to be an integer).
   This can be used to handle the case of unbalanced data sets.
-  The weights are used to compute a weighted average in 
-  the output leaf node. For classification, the contribution 
-  of a row towards the vote of its corresponding level 
-  is multiplied by the weight (weighted mode). For regression, 
-  the output value of the row is multiplied by 
+  The weights are used to compute a weighted average in
+  the output leaf node. For classification, the contribution
+  of a row towards the vote of its corresponding level
+  is multiplied by the weight (weighted mode). For regression,
+  the output value of the row is multiplied by
   the weight (weighted mean).</DD>
 
   <DT>max_depth (optional)</DT>
@@ -173,10 +173,10 @@ tree_train(
       <tr>
       <th>cp</th>
       <td>
-        Default: 0. Complexity parameter.  
+        Default: 0. Complexity parameter.
         A split on a node is attempted only if it
-        decreases the overall lack of fit by a factor of 'cp', 
-        otherwise the split is pruned away. This value is used 
+        decreases the overall lack of fit by a factor of 'cp',
+        otherwise the split is pruned away. This value is used
         to create an initial tree before running
         cross-validation (see below).
 
@@ -190,7 +190,7 @@ tree_train(
         <em>cp</em>. To perform cross-validation, a positive value of
         <em>n_folds</em> (2 or more) should be specified. An additional output
         table <em>\<model_table\>_cv</em> is created containing the values of
-        evaluated <em>cp</em> and the cross-validation error 
+        evaluated <em>cp</em> and the cross-validation error
         statistics. The tree returned
         in the output table corresponds to the <em>cp</em> with the lowest
         cross-validation error (we pick the maximum <em>cp</em> if multiple
@@ -215,23 +215,23 @@ tree_train(
       <th>max_surrogates</th>
       <td>Default: 0. Number of surrogates to store for each node.</td>
       One approach to handling NULLs is to use surrogate splits for each
-      node. A surrogate variable enables you to make better use of 
+      node. A surrogate variable enables you to make better use of
       the data by using another predictor variable that is associated
       (correlated) with the primary split variable.  The surrogate
       variable comes into use when the primary predictior value is NULL.
-      Surrogate rules implemented here are based on reference [1]. 
+      Surrogate rules implemented here are based on reference [1].
     </tr>
     <tr>
     <th>null_as_category</th>
-    <td>Default: FALSE. Whether to treat NULL as a valid level 
+    <td>Default: FALSE. Whether to treat NULL as a valid level
     for categorical features.  FALSE means that NULL is not a
-    valid level, which is probably the most common sitation. 
-    
+    valid level, which is probably the most common sitation.
+
     If set to TRUE, NULL values are considered a categorical value and
     placed at the end of the ordering of categorical levels. Placing at the
     end ensures that NULL is never used as a value to split a node on.
-    One reason to make NULL a category is that it allows you to 
-    predict on categorical levels that were not in the training 
+    One reason to make NULL a category is that it allows you to
+    predict on categorical levels that were not in the training
     data by lumping them into an "other bucket."
 
     This parameter is ignored for continuous-valued features.
@@ -257,26 +257,26 @@ tree_train(
       </tr>
       <tr>
         <th>tree</th>
-        <td>BYTEA8. Trained decision tree model stored in binary 
+        <td>BYTEA8. Trained decision tree model stored in binary
         format (not human readable).</td>
       </tr>
       <tr>
         <th>cat_levels_in_text</th>
-        <td>TEXT[]. Ordered levels (values) of categorical variables 
-        corresponding to the categorical features in 
+        <td>TEXT[]. Ordered levels (values) of categorical variables
+        corresponding to the categorical features in
         the 'list_of_features' argument above.  Used to help
-        interpret the trained decision tree.  For example, if the 
+        interpret the trained decision tree.  For example, if the
         categorical features specified are <em>weather_outlook</em>
-        and <em>windy</em> in that order, then 'cat_levels_in_text' 
+        and <em>windy</em> in that order, then 'cat_levels_in_text'
         might be <em>[overcast, rain, sunny, False, True]</em>.</td>
       </tr>
       <tr>
         <th>cat_n_levels</th>
         <td>INTEGER[]. Number of levels for each categorical variable.
         Used to help interpret the trained decision tree. In the example
-        from above, 'cat_n_levels' would 
-        be <em>[3, 2]</em> since there are 3 levels 
-        for <em>weather_outlook</em> and 2 levels 
+        from above, 'cat_n_levels' would
+        be <em>[3, 2]</em> since there are 3 levels
+        for <em>weather_outlook</em> and 2 levels
         <em>windy</em>.</td>
       </tr>
 
@@ -406,7 +406,7 @@ tree_train(
     </tr>
    </table>
 
-    A cross-validation table called <em>\<output_table_name\>_cv</em> 
+    A cross-validation table called <em>\<output_table_name\>_cv</em>
     is created if 'n_folds' is set in the 'pruning_params'.
     It has the following columns:
      <table class="output">
@@ -547,7 +547,7 @@ programs.
 
 To export the dot format result to an external file,
 use the method below. Please note that you should use unaligned
-table output mode for psql with '-A' flag, or else you may get an 
+table output mode for psql with '-A' flag, or else you may get an
 error when you try to convert the dot file to another format
 for viewing (e.g., PDF). And inside the psql client,
 both '\\t' and '\\o' should be used:
@@ -578,7 +578,7 @@ of the tree output formats.
 @examp
 <h4>Decision Tree Classification Examples</h4>
 
--# Load input data set related to whether 
+-# Load input data set related to whether
 to play golf or not:
 <pre class="example">
 DROP TABLE IF EXISTS dt_golf CASCADE;
@@ -633,7 +633,7 @@ View the output table (excluding the tree which is in binary format):
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
 </pre>
 <pre class="result">
- pruning_cp |        cat_levels_in_text        | cat_n_levels | tree_depth 
+ pruning_cp |        cat_levels_in_text        | cat_n_levels | tree_depth
 ------------+----------------------------------+--------------+------------
           0 | {overcast,rain,sunny,False,True} | {3,2}        |          5
 </pre>
@@ -653,7 +653,7 @@ dependent_varname     | class
 independent_varnames  | "OUTLOOK", temperature, windy
 cat_features          | "OUTLOOK",windy
 con_features          | temperature
-grouping_cols         | 
+grouping_cols         |
 num_all_groups        | 1
 num_failed_groups     | 0
 total_rows_processed  | 14
@@ -663,11 +663,11 @@ dependent_var_type    | text
 input_cp              | 0
 independent_var_types | text, boolean, double precision
 n_folds               | 0
-null_proxy            | 
+null_proxy            |
 </pre>
 
--# Predict output categories.  For the purpose 
-of this example, we use the same data that was 
+-# Predict output categories.  For the purpose
+of this example, we use the same data that was
 used for training:
 <pre class="example">
 \\x off
@@ -676,11 +676,11 @@ SELECT madlib.tree_predict('train_output',          -- tree model
                            'dt_golf',               -- new data table
                            'prediction_results',    -- output table
                            'response');             -- show response
-SELECT g.id, class, estimated_class FROM prediction_results p, 
+SELECT g.id, class, estimated_class FROM prediction_results p,
 dt_golf g WHERE p.id = g.id ORDER BY g.id;
 </pre>
 <pre class="result">
- id |   class    | estimated_class 
+ id |   class    | estimated_class
 ----+------------+-----------------
   1 | Don't Play | Don't Play
   2 | Don't Play | Don't Play
@@ -698,7 +698,7 @@ dt_golf g WHERE p.id = g.id ORDER BY g.id;
  14 | Don't Play | Don't Play
 (14 rows)
 </pre>
-To display the probabilities associated with each 
+To display the probabilities associated with each
 value of the dependent variable, set the 'type' parameter
 to 'prob':
 <pre class="example">
@@ -707,11 +707,11 @@ SELECT madlib.tree_predict('train_output',          -- tree model
                            'dt_golf',               -- new data table
                            'prediction_results',    -- output table
                            'prob');                 -- show probability
-SELECT g.id, class, "estimated_prob_Don't Play",  "estimated_prob_Play" 
+SELECT g.id, class, "estimated_prob_Don't Play",  "estimated_prob_Play"
 FROM prediction_results p, dt_golf g WHERE p.id = g.id ORDER BY g.id;
 </pre>
 <pre class="result">
- id |   class    | estimated_prob_Don't Play | estimated_prob_Play 
+ id |   class    | estimated_prob_Don't Play | estimated_prob_Play
 ----+------------+---------------------------+---------------------
   1 | Don't Play |                         1 |                   0
   2 | Don't Play |                         1 |                   0
@@ -770,7 +770,7 @@ and 9 rows for "Play" in the raw data.
   - If we apply the test of "OUTLOOK" being overcast, then the True (yes) result is
 leaf node 1 which is "Play".  There are 0 "Don't play" rows
 and 4 "Play" rows that correspond to this case (overcast).
-In other words, if it is overcast, you always play golf.  If it is not 
+In other words, if it is overcast, you always play golf.  If it is not
 overcast, you may or may not play golf, depending on the rest
 of the tree.
   - The remaining 5 "Don't play" rows and 5 "Play rows" are then
@@ -813,7 +813,7 @@ SELECT madlib.tree_display('train_output', TRUE);
 &nbsp;} //---end of digraph---------
 </pre>
 One important difference to note about the dot format above is how categorical
-variable tests are displayed:  
+variable tests are displayed:
   - In the text format of the tree, the node 0
 test is "OUTLOOK" in {overcast}, but in the dot format of the tree,
 the same node 0 test reads "\"OUTLOOK\" <= overcast".  This is because
@@ -821,15 +821,15 @@ in dot format for categorical variables, the '<=' symbol
 represents the location in the array 'cat_levels_in_text' from the output
 table for the "OUTLOOK" levels.  The array
 is ['overcast', 'rain', 'sunny', 'False', 'True'] with the first 3 entries
-corresponding to "OUTLOOK" and the last 2 entries corresponding to 'windy'.  So the 
-test "\"OUTLOOK\" <= overcast" means all "OUTLOOK" levels to the 
+corresponding to "OUTLOOK" and the last 2 entries corresponding to 'windy'.  So the
+test "\"OUTLOOK\" <= overcast" means all "OUTLOOK" levels to the
 left of, and including, 'overcast'.  In this case there are no levels
-to the left of 'overcast' in the array so it is simply a test on 
+to the left of 'overcast' in the array so it is simply a test on
 whether it is overcast or not.
-  -  If there was a test "\"OUTLOOK\" <= rain", this would include 
+  -  If there was a test "\"OUTLOOK\" <= rain", this would include
 both 'overcast' and 'rain', since 'overcast' is to the left of 'rain'
 in the array.
-  -  If there was a test "windy <= True", this would include 
+  -  If there was a test "windy <= True", this would include
 both 'False' and 'True', since 'False' is to the left of 'True'
 in the array.
 
@@ -867,12 +867,12 @@ SELECT madlib.tree_display('train_output', TRUE, TRUE);
 </pre>
 The additional information in each node is: impurity, sample size, number of
 weighted rows for each response variable, and classification if the tree was
-pruned at this level.  If your tree is not too big, you may wish to convert the 
-dot format to PDF or another format for better visualization of the 
-tree structure. 
+pruned at this level.  If your tree is not too big, you may wish to convert the
+dot format to PDF or another format for better visualization of the
+tree structure.
 
--# Arrays of features.  Categorical and continuous features 
-can be array columns, in which case the array is expanded to 
+-# Arrays of features.  Categorical and continuous features
+can be array columns, in which case the array is expanded to
 treat each element of the array as a feature:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_summary;
@@ -897,17 +897,17 @@ View the output table (excluding the tree which is in binary format):
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
 </pre>
 <pre class="result">
- pruning_cp |               cat_levels_in_text               | cat_n_levels | tree_depth 
+ pruning_cp |               cat_levels_in_text               | cat_n_levels | tree_depth
 ------------+------------------------------------------------+--------------+------------
           0 | {medium,none,high,low,unhealthy,good,moderate} | {4,3}        |          3
 </pre>
 The first 4 levels correspond to cloud ceiling and the next 3 levels
 correspond to air quality.
 
--# Weighting observations.  Use the 'weights' parameter to 
-adjust a row's vote to balance the dataset.  In our 
-example, the weights are somewhat random but 
-show that a different decision tree is create 
+-# Weighting observations.  Use the 'weights' parameter to
+adjust a row's vote to balance the dataset.  In our
+example, the weights are somewhat random but
+show that a different decision tree is create
 compared to the case where no weights are used:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_summary;
@@ -924,7 +924,7 @@ SELECT madlib.tree_train('dt_golf',         -- source table
                          3,                 -- min split
                          1,                 -- min bucket
                          10                 -- number of bins per continuous variable
-                         ); 
+                         );
 SELECT madlib.tree_display('train_output');
 </pre>
 <pre class="result">
@@ -937,7 +937,7 @@ SELECT madlib.tree_display('train_output');
 &nbsp; - Number of (weighted) rows for each response variable inside [].'
          The response label order is given as ['"Don\'t Play"', '"Play"'].
          For each leaf, the prediction is given after the '-->'
-&nbsp; ------------------------------------- 
+&nbsp; -------------------------------------
  (0)[17 19]  temperature <= 75
     (1)[ 7 16]  temperature <= 72
        (3)[ 7 10]  temperature <= 70
@@ -953,9 +953,9 @@ SELECT madlib.tree_display('train_output');
 
 <h4>Decision Tree Regression Examples</h4>
 
--# Load input data related to fuel consumption and 10 
-aspects of automobile design and performance for 32 
-automobiles (1973–74 models). Data was extracted from 
+-# Load input data related to fuel consumption and 10
+aspects of automobile design and performance for 32
+automobiles (1973–74 models). Data was extracted from
 the 1974 Motor Trend US magazine.
 <pre class="example">
 DROP TABLE IF EXISTS mt_cars;
@@ -1008,7 +1008,7 @@ INSERT INTO mt_cars VALUES
 (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
 </pre>
 
--# We train a regression decision tree with surrogates 
+-# We train a regression decision tree with surrogates
 in order to handle the NULL feature values:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
@@ -1035,7 +1035,7 @@ which shows ordering of levels of categorical variables 'vs' and 'cyl':
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
 </pre>
 <pre class="result">
- pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth 
+ pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth
 ------------+--------------------+--------------+------------
           0 | {0,1,4,6,8}        | {2,3}        |          4
 </pre>
@@ -1055,17 +1055,17 @@ dependent_varname     | mpg
 independent_varnames  | *
 cat_features          | vs,cyl
 con_features          | disp,qsec,wt
-grouping_cols         | 
+grouping_cols         |
 num_all_groups        | 1
 num_failed_groups     | 0
 total_rows_processed  | 32
 total_rows_skipped    | 0
-dependent_var_levels  | 
+dependent_var_levels  |
 dependent_var_type    | double precision
 input_cp              | 0
 independent_var_types | integer, integer, double precision, double precision, double precision
 n_folds               | 0
-null_proxy            | 
+null_proxy            |
 </pre>
 
 -# Predict regression output for the same data and compare with original:
@@ -1076,13 +1076,13 @@ SELECT madlib.tree_predict('train_output',
                            'mt_cars',
                            'prediction_results',
                            'response');
-SELECT s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta 
-FROM prediction_results p, 
+SELECT s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta
+FROM prediction_results p,
 mt_cars s WHERE s.id = p.id ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id | mpg  |  estimated_mpg   |        delta        
+ id | mpg  |  estimated_mpg   |        delta
 ----+------+------------------+---------------------
   1 | 18.7 |            16.84 |                1.86
   2 |   21 | 19.7428571428571 |    1.25714285714286
@@ -1142,13 +1142,13 @@ SELECT madlib.tree_display('train_output', FALSE);
           (13)[10, 15.81]  qsec <= 16.9
              (27)[5, 14.78]  *
              (28)[5, 16.84]  *
-          (14)[4, 13.325]  *                                                                                                    
+          (14)[4, 13.325]  *
  &nbsp;-------------------------------------
 (1 row)
 </pre>
 
--# Display the surrogate variables that are used 
-to compute the split for each node when the primary 
+-# Display the surrogate variables that are used
+to compute the split for each node when the primary
 variable is NULL:
 <pre class="example">
 SELECT madlib.tree_surr_display('train_output');
@@ -1160,17 +1160,17 @@ SELECT madlib.tree_surr_display('train_output');
  (0) cyl in {4}
       1: disp <= 146.7    [common rows = 29]
       2: vs in {1}    [common rows = 26]
-      [Majority branch = 11 ]                                          
+      [Majority branch = 11 ]
  (1) wt <= 2.2
-      [Majority branch = 19 ]                                          
+      [Majority branch = 19 ]
  (2) disp <= 258
       1: cyl in {4,6}    [common rows = 19]
       2: vs in {1}    [common rows = 18]
-      [Majority branch = 7 ]                                          
+      [Majority branch = 7 ]
  (6) qsec <= 17.42
       1: disp > 275.8    [common rows = 11]
       2: vs in {0}    [common rows = 10]
-      [Majority branch = 10 ]                                         
+      [Majority branch = 10 ]
  (13) qsec <= 16.9
       1: wt <= 3.84    [common rows = 8]
       2: disp <= 360    [common rows = 7]
@@ -1178,7 +1178,7 @@ SELECT madlib.tree_surr_display('train_output');
 &nbsp;-------------------------------------
 (1 row)
 </pre>
-@note The 'cyl' parameter in the data set has two tuples with NULL 
+@note The 'cyl' parameter in the data set has two tuples with NULL
 values (<em>id = 9</em> and <em>id = 18</em>).
 In the prediction based on this tree, the surrogate splits for the
 <em>cyl in {4}</em> split in node 0 are used to predict those
@@ -1188,7 +1188,7 @@ the two tuples have non-NULL values for <em>disp</em>, hence the <em>disp <= 146
 split is used to make the prediction. If all the surrogate variables are
 NULL then the majority branch would be followed.
 
--# Now let's use cross validation to select the best 
+-# Now let's use cross validation to select the best
 value of the complexity parameter cp:
 <pre class="example">
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
@@ -1209,13 +1209,13 @@ SELECT madlib.tree_train('mt_cars',         -- source table
                          );
 </pre>
 View the output table (excluding the tree which is in binary format).
-The input cp value was 0 (default) and the best 'pruning_cp' value 
-turns out to be 0 as well in this small example: 
+The input cp value was 0 (default) and the best 'pruning_cp' value
+turns out to be 0 as well in this small example:
 <pre class="example">
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
 </pre>
 <pre class="result">
- pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth 
+ pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth
 ------------+--------------------+--------------+------------
           0 | {0,1,4,6,8}        | {2,3}        |          4
 </pre>
@@ -1224,7 +1224,7 @@ The cp values tested and average error and standard deviation are:
 SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC;
 </pre>
 <pre class="result">
-         cp          |   cv_error_avg   | cv_error_stddev  
+         cp          |   cv_error_avg   | cv_error_stddev
 ---------------------+------------------+------------------
                    0 | 4.60222321567406 | 1.14990035501294
  0.00942145242026098 | 4.71906243157825 | 1.21587651168567
@@ -1237,7 +1237,7 @@ SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC;
 
 <h4>NULL Handling Example</h4>
 
--# Create toy example to illustrate 'null-as-category' handling 
+-# Create toy example to illustrate 'null-as-category' handling
 for categorical features:
 <pre class='example'>
 DROP TABLE IF EXISTS null_handling_example;
@@ -1255,7 +1255,7 @@ INSERT INTO null_handling_example VALUES
 (4,'US','NY','rainy','d');
 </pre>
 
--# Train decision tree.  Note that 'NULL' is set as a 
+-# Train decision tree.  Note that 'NULL' is set as a
 valid level for the categorical features country, weather and city:
 <pre class='example'>
 DROP TABLE IF EXISTS train_output, train_output_summary;
@@ -1278,7 +1278,7 @@ SELECT madlib.tree_train('null_handling_example',         -- source table
 SELECT cat_levels_in_text, cat_n_levels FROM train_output;
 </pre>
 <pre class='result'>
-            cat_levels_in_text            | cat_n_levels 
+            cat_levels_in_text            | cat_n_levels
 ------------------------------------------+--------------
  {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
 </pre>
@@ -1297,8 +1297,8 @@ id_col_name           | id
 dependent_varname     | response
 independent_varnames  | country, weather, city
 cat_features          | country,weather,city
-con_features          | 
-grouping_cols         | 
+con_features          |
+grouping_cols         |
 num_all_groups        | 1
 num_failed_groups     | 0
 total_rows_processed  | 4
@@ -1311,7 +1311,7 @@ n_folds               | 0
 null_proxy            | __NULL__
 </pre>
 
--# Predict for data not previously seen by assuming NULL 
+-# Predict for data not previously seen by assuming NULL
 value as the default:
 <pre class='example'>
 \\x off
@@ -1339,7 +1339,7 @@ FROM prediction_results p, table_test s
 WHERE s.id = p.id ORDER BY id;
 </pre>
 <pre class='result'>
- id | expected_response | estimated_response 
+ id | expected_response | estimated_response
 ----+-------------------+--------------------
   1 | a                 | a
   2 | b                 | b
@@ -1347,12 +1347,12 @@ WHERE s.id = p.id ORDER BY id;
   4 | d                 | d
 (4 rows)
 </pre>
-There is only training data for country 'US' so the 
-response for country 'IN' is 'a', corresponding to 
-a NULL (not 'US') country level.  Likewise, any 
-city in the 'US' that is not 'NY' will predict 
-response 'b', corresponding to a NULL (not 'NY') 
-city level. 
+There is only training data for country 'US' so the
+response for country 'IN' is 'a', corresponding to
+a NULL (not 'US') country level.  Likewise, any
+city in the 'US' that is not 'NY' will predict
+response 'b', corresponding to a NULL (not 'NY')
+city level.
 
 -# Display the decision tree in basic text format:
 <pre class="example">

http://git-wip-us.apache.org/repos/asf/madlib/blob/26f61e91/src/ports/postgres/modules/utilities/validate_args.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/validate_args.py_in b/src/ports/postgres/modules/utilities/validate_args.py_in
index 1d713a2..4296491 100644
--- a/src/ports/postgres/modules/utilities/validate_args.py_in
+++ b/src/ports/postgres/modules/utilities/validate_args.py_in
@@ -2,7 +2,6 @@ import plpy
 import re
 import string
 
-
 # Postgresql naming restrictions
 """
 Both keywords and identifier names in PostgreSQL have a maximum length limit of
@@ -272,16 +271,24 @@ def drop_tables(table_list):
     if drop_str:
         plpy.execute("DROP TABLE IF EXISTS {0}".format(drop_str))
 
-def table_is_empty(tbl):
+def table_is_empty(tbl, filter_str=None):
     """
     Returns True if the input table has no rows
     """
     if tbl is None or tbl.lower() == 'null':
         plpy.error('Input error: Table name (NULL) is invalid')
-    content = plpy.execute("""SELECT count(*) FROM
-                                (SELECT * FROM {0} LIMIT 1) q1""".
-                           format(tbl))[0]["count"]
-    return not bool(content)   # if content == 0 then True, else False
+    where_clause = "WHERE " + filter_str if filter_str else ''
+    n_valid_tuples = plpy.execute("""
+        SELECT count(*)
+        FROM (
+            -- create a sub-query to get only single tuple
+            SELECT *
+            FROM {0}
+            {1}
+            LIMIT 1
+        ) q
+        """.format(tbl, where_clause))[0]["count"]
+    return (False if n_valid_tuples > 0 else True)
 # -------------------------------------------------------------------------
 
 
@@ -368,8 +375,12 @@ def get_expr_type(expr, tbl):
         SELECT pg_typeof({0}) AS type
         FROM {1}
         LIMIT 1
-        """.format(expr, tbl))[0]['type']
-    return expr_type.lower()
+        """.format(expr, tbl))
+    if not expr_type:
+        plpy.error("Unable to get type of expression ({0}). "
+                   "Table {1} may not contain any valid tuples".
+                   format(expr, tbl))
+    return expr_type[0]['type'].lower()
 # -------------------------------------------------------------------------
 
 


[3/3] madlib git commit: Logregr: Report error if output table is empty

Posted by ri...@apache.org.
Logregr: Report error if output table is empty

JIRA: MADLIB-1172

When the model cannot be generated due to ill-conditioned input data,
the output table doesn't get populated. In this case, we report an
error instead of leaving behind an empty table.

Closes #270
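
The gist of the fix, condensed from the patch below (`tbl_output` is the
user-supplied output table name):

    failed_groups = plpy.execute(
        "SELECT count(*) AS count FROM {0} WHERE coef IS NULL".format(tbl_output)
    )[0]["count"]
    all_groups = plpy.execute(
        "SELECT count(*) AS count FROM {0}".format(tbl_output)
    )[0]["count"]
    if failed_groups == all_groups or all_groups == 0:
        # No group produced a model: drop the (empty) output table and
        # report an error instead of returning it silently.
        plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
        plpy.error("Logregr error: No model created possibly "
                   "due to ill-conditioned data.")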


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/abef95ec
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/abef95ec
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/abef95ec

Branch: refs/heads/master
Commit: abef95ec99d2797fa7a51c8d4548d88a656d7364
Parents: ef52d87
Author: Himanshu Pandey <hp...@pivotal.io>
Authored: Thu May 31 18:44:41 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu May 31 18:47:32 2018 -0700

----------------------------------------------------------------------
 .../postgres/modules/regress/logistic.py_in     | 157 +++++++++----------
 .../modules/regress/test/logistic.sql_in        |  69 ++------
 2 files changed, 92 insertions(+), 134 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/logistic.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/logistic.py_in b/src/ports/postgres/modules/regress/logistic.py_in
index 76cbb6a..77ea465 100644
--- a/src/ports/postgres/modules/regress/logistic.py_in
+++ b/src/ports/postgres/modules/regress/logistic.py_in
@@ -153,7 +153,8 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
         plpy.error("Logregr error: Invalid output table name!")
 
     if (table_exists(tbl_output, only_first_schema=True)):
-        plpy.error("Output table name already exists. Drop the table before calling the function.")
+        plpy.error("Output table name already exists. Drop the table before "
+                   "calling the function.")
 
     if not dep_col or dep_col.strip().lower() in ('null', ''):
         plpy.error("Logregr error: Invalid dependent column name!")
@@ -164,7 +165,6 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
     if not ind_col or ind_col.lower() in ('null', ''):
         plpy.error("Logregr error: Invalid independent column name!")
 
-
     if grouping_col is not None:
         if grouping_col == '':
             plpy.error("Logregr error: Invalid grouping columns name!")
@@ -173,14 +173,14 @@ def __logregr_validate_args(schema_madlib, tbl_source, tbl_output, dep_col,
             plpy.error("Logregr error: Grouping column does not exist!")
 
         intersect = frozenset(_string_to_array(grouping_col)).intersection(
-                frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
-                           'p_values', 'odds_ratios', 'condition_no',
-                           'num_processed', 'num_missing_rows_skipped',
-                           'variance_covariance')))
+            frozenset(('coef', 'log_likelihood', 'std_err', 'z_stats',
+                       'p_values', 'odds_ratios', 'condition_no',
+                       'num_processed', 'num_missing_rows_skipped',
+                       'variance_covariance')))
         if len(intersect) > 0:
             plpy.error("Logregr error: Conflicted grouping column name.\n"
                        "Predefined name(s) {0} are not allow!".format(
-                            ', '.join(intersect)))
+                           ', '.join(intersect)))
 
     if max_iter <= 0:
         plpy.error("Logregr error: Maximum number of iterations must be positive!")
@@ -231,12 +231,12 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
             'cg': "__logregr_cg_result",
             'igd': "__logregr_igd_result"}
 
-    plpy.execute("select {schema_madlib}.create_schema_pg_temp()".format(**args))
-    plpy.execute(
-        """
-        drop table if exists pg_temp.{tbl_logregr_args};
-        create table pg_temp.{tbl_logregr_args} as
-            select
+    plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".
+                 format(**args))
+    plpy.execute("""
+        DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+        CREATE TABLE pg_temp.{tbl_logregr_args} as
+            SELECT
                 {max_iter} as max_iter,
                 {tolerance} as tolerance
         """.format(**args))
@@ -257,7 +257,8 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                                       dep_col, ind_col, optimizer,
                                       grouping_col=grouping_col,
                                       grouping_str=grouping_str,
-                                      col_grp_iteration=args["col_grp_iteration"],
+                                      col_grp_iteration=args[
+                                          "col_grp_iteration"],
                                       col_grp_state=args["col_grp_state"])
 
     grouping_str1 = "" if grouping_col is None else grouping_col + ","
@@ -265,65 +266,61 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
     using_str = "" if grouping_str1 == "" else "using (" + grouping_col + ")"
     join_str = "," if grouping_str1 == "" else "join "
 
-    plpy.execute(
-        """
-        drop table if exists {tbl_output};
-        create table {tbl_output} as
-            select
+    plpy.execute("""
+        DROP TABLE IF EXISTS {tbl_output};
+        CREATE TABLE {tbl_output} as
+            SELECT
                 {grouping_str1}
-                (case when (result).status = 1 then (result).coef
-                    else NULL::double precision[] end) as coef,
-                (case when (result).status = 1 then (result).log_likelihood
-                    else NULL::double precision end) as log_likelihood,
-                (case when (result).status = 1 then (result).std_err
-                    else NULL::double precision[] end) as std_err,
-                (case when (result).status = 1 then (result).z_stats
-                    else NULL::double precision[] end) as z_stats,
-                (case when (result).status = 1 then (result).p_values
-                    else NULL::double precision[] end) as p_values,
-                (case when (result).status = 1 then (result).odds_ratios
-                    else NULL::double precision[] end) as odds_ratios,
-                (case when (result).status = 1 then (result).condition_no
-                    else NULL::double precision end) as condition_no,
-                (case when (result).status = 1 then (result).num_processed
-                    when result is NULL then 0
-                    else NULL::bigint end) as num_rows_processed,
-                (case when (result).status = 1 then num_rows - (result).num_processed
-                    when result is null then num_rows
-                    else NULL::bigint end) as num_missing_rows_skipped,
+                (CASE WHEN (result).status = 1 THEN (result).coef
+                    ELSE NULL::double precision[] END) as coef,
+                (CASE WHEN (result).status = 1 THEN (result).log_likelihood
+                    ELSE NULL::double precision END) as log_likelihood,
+                (CASE WHEN (result).status = 1 THEN (result).std_err
+                    ELSE NULL::double precision[] END) as std_err,
+                (CASE WHEN (result).status = 1 THEN (result).z_stats
+                    ELSE NULL::double precision[] END) as z_stats,
+                (CASE WHEN (result).status = 1 THEN (result).p_values
+                    ELSE NULL::double precision[] END) as p_values,
+                (CASE WHEN (result).status = 1 THEN (result).odds_ratios
+                    ELSE NULL::double precision[] END) as odds_ratios,
+                (CASE WHEN (result).status = 1 THEN (result).condition_no
+                    ELSE NULL::double precision END) as condition_no,
+                (CASE WHEN (result).status = 1 THEN (result).num_processed
+                    when result is NULL THEN 0
+                    ELSE NULL::bigint end) AS num_rows_processed,
+                (CASE WHEN (result).status = 1 THEN num_rows - (result).num_processed
+                    when result is null THEN num_rows
+                    ELSE NULL::bigint end) AS num_missing_rows_skipped,
                 {col_grp_iteration} as num_iterations,
-                (case when (result).status = 1 then (result).vcov
-                    else NULL::double precision[] end) as variance_covariance
-            from
+                (CASE WHEN (result).status = 1 THEN (result).vcov
+                    ELSE NULL::double precision[] END) as variance_covariance
+            FROM
             (
-                select
-                    {col_grp_iteration}, {grouping_str1} result, num_rows
-                from
-                (
-                    (select
+                SELECT {col_grp_iteration}, {grouping_str1} result, num_rows
+                FROM (
+                    (SELECT
                         {grouping_str1}
                         {schema_madlib}.{fnName}({col_grp_state}) as result,
                         {col_grp_iteration}
-                    from
+                    FROM
                         {tbl_logregr_state}
                     ) t
-                    join
-                    (
-                    select
+                    JOIN
+                    (SELECT
                         {grouping_str1}
                         max({col_grp_iteration}) as {col_grp_iteration}
-                    from {tbl_logregr_state}
-                    group by {grouping_str2}
+                    FROM {tbl_logregr_state}
+                    GROUP BY {grouping_str2}
                     ) s
-                    using ({grouping_str1} {col_grp_iteration})
+                    USING ({grouping_str1} {col_grp_iteration})
                 ) q1
                 {join_str}
                 (
-                    select
+                    SELECT
                         {grouping_str1}
                         count(*) num_rows
-                    from {tbl_source}
-                    group by {grouping_str2}
+                    FROM {tbl_source}
+                    GROUP BY {grouping_str2}
                 ) q2
                 {using_str}
             ) q3
@@ -336,21 +333,28 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                    **args))
 
     failed_groups = plpy.execute("""
-                        select count(*) as count
-                        from {tbl_output}
-                        where coef is Null
-                        """.format(**args))[0]["count"]
+        SELECT count(*) as count
+        FROM {tbl_output}
+        WHERE coef IS NULL
+        """.format(tbl_output=tbl_output))[0]["count"]
     all_groups = plpy.execute("""
-                    select count(*) as count
-                    from {tbl_output}
-                    """.format(**args))[0]["count"]
-
-    num_rows = plpy.execute(
-        """
-        select
-            sum(num_rows_processed) as num_rows_processed,
-            sum(num_missing_rows_skipped) as num_missing_rows_skipped
-        from {tbl_output}
+        SELECT count(*) AS count
+        FROM {tbl_output}
+        """.format(**args))[0]["count"]
+
+    # JIRA: MADLIB-1172: When no model can be generated due to ill-conditioned
+    # input data, the output table dosen't get populated. In That
+    # case, report back an error instead of a successful empty table.
+    if failed_groups == all_groups or all_groups == 0:
+        plpy.execute("DROP TABLE IF EXISTS " + str(tbl_output))
+        plpy.error("Logregr error: No model created possibly "
+                   "due to ill-conditioned data.")
+
+    num_rows = plpy.execute("""
+        SELECT
+            sum(num_rows_processed) AS num_rows_processed,
+            sum(num_missing_rows_skipped) AS num_missing_rows_skipped
+        FROM {tbl_output}
         """.format(tbl_output=tbl_output))[0]
 
     if num_rows['num_rows_processed'] is None:
@@ -380,14 +384,9 @@ def __logregr_train_compute(schema_madlib, tbl_source, tbl_output, dep_col,
                    grouping_col="'" + grouping_col + "'" if grouping_col else "NULL",
                    **args))
 
-    # if grouping_col:
-    #     plpy.info(str(all_groups - failed_groups) +
-    #                 " groups succesfully passed, and " +
-    #                 str(failed_groups) + " groups failed")
-
     plpy.execute("""
-                 drop table if exists pg_temp.{tbl_logregr_args};
-                 drop table if exists pg_temp.{tbl_logregr_state}
+                 DROP TABLE IF EXISTS pg_temp.{tbl_logregr_args};
+                 DROP TABLE IF EXISTS pg_temp.{tbl_logregr_state}
                  """.format(**args))
 
     plpy.execute("set client_min_messages to " + old_msg_level)
@@ -517,7 +516,7 @@ SELECT * from patients_logregr;
         help_string = "No such option. Use {schema_madlib}.logregr_train('help')"
 
     return help_string.format(schema_madlib=schema_madlib)
-## ========================================================================
+# ========================================================================
 
 
 def logregr_predict_help(schema_madlib, message, **kwargs):

http://git-wip-us.apache.org/repos/asf/madlib/blob/abef95ec/src/ports/postgres/modules/regress/test/logistic.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/regress/test/logistic.sql_in b/src/ports/postgres/modules/regress/test/logistic.sql_in
index b029404..697dce4 100644
--- a/src/ports/postgres/modules/regress/test/logistic.sql_in
+++ b/src/ports/postgres/modules/regress/test/logistic.sql_in
@@ -720,22 +720,8 @@ select logregr_train(
     0
 );
 
--- Even though we were far more generous for the conjugate-gradient optimizer,
--- but it still generates failures intermittently. We comment it out here
--- to acknowledge CG may generate un-converged results.
--- SELECT
---     assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
---     assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
---     assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
---     assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
---     assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
---     assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
--- FROM temp_result;
-
--- IGD essentially does not work for this case, so we are not testing it
-
--- insert some NULL values for null handling testing
-insert into grad_school (admit, gre, gpa, rank) values
+--- insert some NULL values for null handling testing
+INSERT INTO grad_school (admit, gre, gpa, rank) VALUES
 (NULL, NULL, 3, 4),
 (1, NULL, 3, 5);
 
@@ -748,44 +734,17 @@ select logregr_train(
     'ARRAY[1, gre, gpa]',
     'rank');
 
-DROP TABLE IF EXISTS all_null_patients;
-CREATE TABLE all_null_patients(
-    id INTEGER NOT NULL,
-    second_attack INTEGER,
-    treatment INTEGER,
-    trait_anxiety INTEGER
-);
 
-INSERT INTO all_null_patients(id, second_attack, treatment, trait_anxiety) VALUES
-(0, NULL, 1, 70),
-(0, NULL, 1, 80),
-(0, NULL, 1, 50),
-(0, NULL, 0, 60),
-(0, NULL, 0, 40),
-(0, NULL, 0, 65),
-(0, NULL, 0, 75),
-(0, NULL, 0, 80),
-(0, NULL, 0, 70),
-(0, NULL, 0, 60),
-(1, NULL, 1, 65),
-(1, NULL, 1, 50),
-(1, NULL, 1, 45),
-(1, NULL, 1, 35),
-(1, NULL, 1, 40),
-(1, NULL, 1, 50),
-(1, NULL, 0, 55),
-(1, NULL, 0, 45),
-(1, NULL, 0, 50),
-(1, NULL, 0, 60);
+-- Even though we were far more generous for the conjugate-gradient optimizer,
+-- but it still generates failures intermittently. We comment it out here
+-- to acknowledge CG may generate un-converged results.
+-- SELECT
+--     assert(relative_error(coef, ARRAY[-3.989979, 0.002264, 0.804038, -0.675443, -1.340204, -1.551464]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong coef'),
+--     assert(relative_error(log_likelihood, -229.2587) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong log_likelihood'),
+--     assert(relative_error(std_err, ARRAY[1.139951, 0.001094, 0.331819, 0.316490, 0.345306, 0.417832]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong std_err'),
+--     assert(relative_error(z_stats, ARRAY[-3.500, 2.070, 2.423, -2.134, -3.881, -3.713]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong z-stats'),
+--     assert(relative_error(p_values, ARRAY[0.000465, 0.038465, 0.015388, 0.032829, 0.000104, 0.000205]) < 1, 'Logistic regression with CG optimizer (grad_school): Wrong p-values'),
+--     assert(relative_error(odds_ratios, ARRAY[0.0185001, 1.0022670, 2.2345448, 0.5089310, 0.2617923, 0.2119375]) < 0.1, 'Logistic regression with CG optimizer (grad_school): Wrong odds_ratios')
+-- FROM temp_result;
 
-drop table if exists temp_result;
-drop table if exists temp_result_summary;
-select logregr_train(
-    'all_null_patients',
-    'temp_result',
-    'second_attack',
-    'ARRAY[1, treatment, trait_anxiety]',
-    'id',
-    20,
-    'cg'
-);
+-- IGD essentially does not work for this case, so we are not testing it


[2/3] madlib git commit: DT: Ensure summary table has correct features

Posted by ri...@apache.org.
DT: Ensure summary table has correct features

JIRA: MADLIB-1236

If a categorical feature is dropped (because it has only a single level),
that feature should not be included in the summary table's feature list,
since tree_predict reads the source table using the features recorded in
the summary table. This commit ensures the correct features are populated
in the summary table.

Closes #268
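
In essence (condensed from the patch below), the feature lists written to the
summary table now come from the `bins` dictionary, which already excludes any
categorical column dropped for having a single level:

    # Use the feature lists carried in `bins`, not the original
    # cat_features/con_features arguments, when writing the summary table.
    _create_summary_table(
        schema_madlib, split_criterion, training_table_name,
        output_table_name, id_col_name,
        bins['cat_features'],    # categorical features actually used
        bins['con_features'],    # continuous features actually used
        dependent_variable, list_of_features,
        failed_groups, is_classification, n_all_rows,
        n_rows, dep_list, all_cols_types, cp, grouping_cols, 1,
        ...)  # remaining arguments unchanged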


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/ef52d871
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/ef52d871
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/ef52d871

Branch: refs/heads/master
Commit: ef52d87198d73db272ef033f5c7c0f26b2956a0b
Parents: 26f61e9
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu May 3 11:38:27 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu May 31 17:03:37 2018 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  | 51 ++++++++++++--------
 1 file changed, 30 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/ef52d871/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 48b8fab..04fde7e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -56,11 +56,6 @@ def _tree_validate_args(
             "Decision tree error: Invalid data table.")
     _assert(table_exists(training_table_name),
             "Decision tree error: Data table is missing.")
-    _assert(not table_is_empty(training_table_name,
-                               _get_filter_str(dependent_variable, grouping_cols)),
-            "Decision tree error: Data table ({0}) is empty "
-            "(after filtering invalid tuples)".
-            format(training_table_name))
 
     _assert(not table_exists(output_table_name, only_first_schema=True),
             "Decision tree error: Output table already exists.")
@@ -95,6 +90,12 @@ def _tree_validate_args(
     _assert(max_depth >= 0 and max_depth < 100,
             "Decision tree error: maximum tree depth must be positive and less than 100.")
 
+    _assert(not table_is_empty(training_table_name,
+                               _get_filter_str(dependent_variable, grouping_cols)),
+            "Decision tree error: Data table ({0}) is empty "
+            "(after filtering invalid tuples)".
+            format(training_table_name))
+
     _assert(cp >= 0, "Decision tree error: cp must be non-negative.")
     _assert(min_split > 0, "Decision tree error: min_split must be positive.")
     _assert(min_bucket > 0, "Decision tree error: min_bucket must be positive.")
@@ -510,8 +511,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
 
 
 def _create_output_tables(schema_madlib, training_table_name, output_table_name,
-                          tree_states, bins,
-                          split_criterion, cat_features, con_features,
+                          tree_states, bins, split_criterion,
                           id_col_name, dependent_variable, list_of_features,
                           is_classification, n_all_rows, n_rows, dep_list, cp,
                           all_cols_types, grouping_cols=None,
@@ -519,19 +519,19 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name,
                           n_folds=0, null_proxy=None, **kwargs):
     if not grouping_cols:
         _create_result_table(schema_madlib, tree_states[0],
-                             bins['cat_origin'], bins['cat_n'], cat_features,
-                             con_features, output_table_name,
+                             bins['cat_origin'], bins['cat_n'], bins['cat_features'],
+                             bins['con_features'], output_table_name,
                              use_existing_tables, running_cv, n_folds)
     else:
         _create_grp_result_table(
-            schema_madlib, tree_states, bins, cat_features,
-            con_features, output_table_name, grouping_cols, training_table_name,
-            use_existing_tables, running_cv, n_folds)
+            schema_madlib, tree_states, bins, bins['cat_features'],
+            bins['con_features'], output_table_name, grouping_cols,
+            training_table_name, use_existing_tables, running_cv, n_folds)
 
     failed_groups = sum(row['finished'] != 1 for row in tree_states)
     _create_summary_table(
         schema_madlib, split_criterion, training_table_name,
-        output_table_name, id_col_name, cat_features, con_features,
+        output_table_name, id_col_name, bins['cat_features'], bins['con_features'],
         dependent_variable, list_of_features,
         failed_groups, is_classification, n_all_rows,
         n_rows, dep_list, all_cols_types, cp, grouping_cols, 1,
@@ -747,6 +747,7 @@ def _get_bins(schema_madlib, training_table_name,
 
         return dict(
             con=con_splits['con_splits'],
+            con_features=con_features,
             cat_origin=[level for col in cat_features
                         for level in all_levels[col_to_row[col]]['levels']],
             cat_n=[len(all_levels[col_to_row[col]]['levels'])
@@ -756,6 +757,7 @@ def _get_bins(schema_madlib, training_table_name,
         # categorical part is empty
         return dict(
             con=con_splits['con_splits'],
+            con_features=con_features,
             cat_origin=[],
             cat_n=[],
             cat_features=[])
@@ -1023,6 +1025,7 @@ def _get_bins_grps(
         con = [''] * len(grp_key_con)
 
     return dict(con=con,
+                con_features=con_features,
                 grp_key_con=grp_key_con,
                 cat_origin=cat_origin,
                 cat_n=cat_n,
@@ -1477,6 +1480,8 @@ def _create_summary_table(
         dep_list, all_cols_types, cp, grouping_cols=None, n_groups=1,
         use_existing_tables=False, n_folds=0, null_proxy=None):
 
+    output_table_summary = add_postfix(output_table_name, "_summary")
+
     # dependent variables
     if dep_list:
         dep_list_str = ("$dep_list$" +
@@ -1500,11 +1505,10 @@ def _create_summary_table(
         cp_str = str(cp) + "::double precision"
 
     if use_existing_tables:
-        header = "INSERT INTO {0} ".format(
-            add_postfix(output_table_name, "_summary"))
+        plpy.execute("TRUNCATE " + output_table_summary)
+        header = "INSERT INTO {0} ".format(output_table_summary)
     else:
-        header = "CREATE TABLE {0} AS ".format(
-            add_postfix(output_table_name, "_summary"))
+        header = "CREATE TABLE {0} AS ".format(output_table_summary)
     null_proxy_str="NULL" if null_proxy is None else "'{0}'".format(null_proxy)
     sql = header + """
             SELECT
@@ -1614,7 +1618,8 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
     """
     _validate_predict(schema_madlib, model, source, output, use_existing_tables)
     model_summary = add_postfix(model, "_summary")
-    # obtain the cat_features and con_features from model table
+
+    # obtain the cat_features and con_features from model summary table
     summary_elements = plpy.execute("SELECT * FROM {0}".format(model_summary))[0]
 
     list_of_features = split_quoted_delimited_str(summary_elements["independent_varnames"])
@@ -1624,12 +1629,13 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
             "Decision tree error: Missing columns in predict data table ({0}) "
             "that were used during training".format(source))
     id_col_name = summary_elements["id_col_name"]
-    grouping_cols_str = summary_elements.get("grouping_cols")  # optional, default = None
     dep_varname = summary_elements["dependent_varname"]
     dep_levels = summary_elements["dependent_var_levels"]
     is_classification = summary_elements["is_classification"]
     dep_type = summary_elements['dependent_var_type']
-    null_proxy = summary_elements.get('null_proxy')  # optional, default = None
+    # optional variables, default value is None
+    grouping_cols_str = summary_elements.get("grouping_cols")
+    null_proxy = summary_elements.get('null_proxy')
 
     # find which columns are of type boolean
     boolean_cats = set([key for key, value in get_cols_and_types(source)
@@ -1664,7 +1670,10 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
                         tree,
                         {cat_features_str}::INTEGER[],
                         {con_features_str}::DOUBLE PRECISION[]) as {pred_name}
-            FROM {source} as s {join_str} {model} as m {using_str}
+            FROM {source} as s
+                 {join_str}
+                 {model} as m
+                 {using_str}
             {use_fold}
             """
     else: