You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/04/01 03:21:30 UTC

[11/11] incubator-madlib git commit: Build: Add support for HAWQ 2.0

Build: Add support for HAWQ 2.0

JIRA: MADLIB-960

This commit adds support for HAWQ 2.0. Changes have been made to
how the m4_ifdef works for HAWQ. The m4 change was made to simplify
the system and allow different code paths for the different DB
versions.

Finally, the older SVD and CART methods (decision tree and random forest) have
been removed. These functions have been deprecated for a long(!) time and lead
to issues when compiled with HAWQ 2.0.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/96f9ac04
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/96f9ac04
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/96f9ac04

Branch: refs/heads/master
Commit: 96f9ac042857914aa0e96aed6867741b5fb6b3c4
Parents: 35d6561
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Mon Mar 28 12:00:37 2016 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Thu Mar 31 18:15:05 2016 -0700

----------------------------------------------------------------------
 methods/cart/src/pg_gp/c45.sql_in               | 1435 ------
 methods/cart/src/pg_gp/dt.c                     | 2751 -----------
 methods/cart/src/pg_gp/dt.sql_in                | 4299 ------------------
 methods/cart/src/pg_gp/dt_preproc.sql_in        | 1903 --------
 methods/cart/src/pg_gp/dt_utility.sql_in        |  583 ---
 methods/cart/src/pg_gp/rf.sql_in                | 1269 ------
 methods/cart/src/pg_gp/sql/dt_test.sql_in       |  913 ----
 .../cart/src/pg_gp/sql/dt_utility_test.sql_in   |   67 -
 methods/cart/src/pg_gp/sql/rf_test.sql_in       |  631 ---
 src/config/Modules.yml                          |    2 -
 src/madpack/madpack.py                          |  724 ++-
 src/madpack/upgrade_util.py                     |   17 +-
 src/modules/linalg/metric.cpp                   |    4 +-
 src/modules/linalg/metric.hpp                   |    4 +-
 src/ports/hawq/2.0/CMakeLists.txt               |    5 +
 src/ports/hawq/2.0/config/CMakeLists.txt        |   19 +
 src/ports/hawq/2.0/config/Modules.yml           |   51 +
 src/ports/hawq/cmake/FindHAWQ_2_0.cmake         |    2 +
 src/ports/hawq/cmake/HAWQUtils.cmake            |    6 +
 .../modules/assoc_rules/test/assoc_rules.sql_in |   10 +-
 src/ports/postgres/modules/kmeans/kmeans.py_in  |  144 +-
 src/ports/postgres/modules/kmeans/kmeans.sql_in |    8 +-
 src/ports/postgres/modules/linalg/linalg.sql_in |   50 +-
 .../postgres/modules/svd_mf/__init__.py_in      |    0
 src/ports/postgres/modules/svd_mf/svdmf.py_in   |  258 --
 src/ports/postgres/modules/svd_mf/svdmf.sql_in  |  199 -
 .../postgres/modules/svd_mf/test/svdmf.sql_in   |   79 -
 .../postgres/modules/utilities/control.py_in    |  289 +-
 .../modules/utilities/control_composite.py_in   |  219 +-
 .../modules/utilities/group_control.py_in       |  508 +--
 .../utilities/in_mem_group_control.py_in        |   14 +-
 .../postgres/modules/utilities/utilities.py_in  |    8 +-
 .../postgres/modules/utilities/utilities.sql_in |    4 +-
 33 files changed, 1036 insertions(+), 15439 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/96f9ac04/methods/cart/src/pg_gp/c45.sql_in
----------------------------------------------------------------------
diff --git a/methods/cart/src/pg_gp/c45.sql_in b/methods/cart/src/pg_gp/c45.sql_in
deleted file mode 100644
index f013278..0000000
--- a/methods/cart/src/pg_gp/c45.sql_in
+++ /dev/null
@@ -1,1435 +0,0 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * @file c45.sql_in
- *
- * @brief C4.5 APIs and main controller written in PL/PGSQL
- * @date April 5, 2012
- *
- * @sa For a brief introduction to decision trees, see the
- *     module description \ref grp_dectree.
- *
- *//* ----------------------------------------------------------------------- */
-
-m4_include(`SQLCommon.m4')
-
-/**
-@addtogroup grp_dectree
-
-<div class="toc"><b>Contents</b>
-<ul>
-<li><a href="#input">Input</a></li>
-<li><a href="#train">Training Function</a></li>
-<li><a href="#classify">Classification Function</a></li>
-<li><a href="#score">Scoring Function</a></li>
-<li><a href="#display">Display Tree Function</a></li>
-<li><a href="#notes">Implementation Notes</a></li>
-<li><a href="#examples">Examples</a></li>
-<li><a href="#related">Related Topics</a></li>
-</ul>
-</div>
-
-@brief Generates a decision tree using the C4.5 algorithm.
-
-\warning <em> This is an old implementation of decision trees.
-For a newer implementation, please see \ref grp_decision_tree</em>
-
-This module provides an implementation of the C4.5 algorithm to
-grow decision trees.
-
-The implementation supports:
-- Building the decision tree
-- Multiple split criteria, including:
-  -- Information Gain
-  -- Gini Coefficient
-  -- Gain Ratio
-- Decision tree Pruning
-- Decision tree classification/scoring
-- Decision tree display
-- Rule generation
-- Continuous and discrete features
-- Missing value handling
-
-@anchor train
-@par Training Function
-
-Run the training algorithm on the source data:
-<pre class="syntax">
-c45_train( split_criterion,
-           training_table_name,
-           result_tree_table_name,
-           validation_table_name,
-           continuous_feature_names,
-           feature_col_names,
-           id_col_name,
-           class_col_name,
-           confidence_level,
-           how2handle_missing_value,
-           max_tree_depth,
-           node_prune_threshold,
-           node_split_threshold,
-           verbosity
-         )
-</pre>
-\b Arguments
-<dl class="arglist">
-  <dt>split_criterion</dt>
-  <dd>The name of the split criterion that should be used
-    for tree construction. The valid values are
-    ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
-    Information gain(infogain) and gini index(gini) are biased
-    toward multivalued attributes. Gain ratio(gainratio) adjusts
-    for this bias. However, it tends to prefer unbalanced splits
-    in which one partition is much smaller than the others.
-  </dd>
-    <dt>training_table_name</dt>
-    <dd>The name of the table/view with the source data.
-The <b>training data</b> is expected to be of the following form:
-<pre>{TABLE|VIEW} <em>trainingSource</em> (
-    ...
-    <em>id</em> INT|BIGINT,
-    <em>feature1</em> SUPPORTED_DATA_TYPE,
-    <em>feature2</em> SUPPORTED_DATA_TYPE,
-    <em>feature3</em> SUPPORTED_DATA_TYPE,
-    ....................
-    <em>featureN</em> SUPPORTED_DATA_TYPE,
-    <em>class</em>    SUPPORTED_DATA_TYPE,
-    ...
-)</pre>
-The detailed list of SUPPORTED_DATA_TYPE is:
-SMALLINT, INT, BIGINT, FLOAT8, REAL,
-DECIMAL, INET, CIDR, MACADDR, BOOLEAN,
-CHAR, VARCHAR, TEXT, "char",
-DATE, TIME, TIMETZ, TIMESTAMP, TIMESTAMPTZ, and INTERVAL.
-    </dd>
-    <dt>result_tree_table_name</dt>
-    <dd>The name of the table to contain the decision tree output. The table stores an abstract object (representing the model) used for further classification. It has the following columns:
-      <table class="output">
-        <tr>
-         <th>id</th>
-       </tr>
-       <tr>
-         <th>tree_location</th>
-       </tr>
-       <tr>
-         <th>feature</th>
-       </tr>
-       <tr>
-         <th>probability</th>
-       </tr>
-       <tr>
-         <th>ebp_coeff</th>
-       </tr>
-       <tr>
-         <th>maxclass</th>
-       </tr>
-       <tr>
-         <th>scv</th>
-       </tr>
-       <tr>
-         <th>live</th>
-       </tr>
-       <tr>
-         <th>sample_size</th>
-       </tr>
-       <tr>
-         <th>parent_id</th>
-       </tr>
-       <tr>
-         <th>lmc_nid</th>
-       </tr>
-       <tr>
-         <th>lmc_fval</th>
-       </tr>
-       <tr>
-         <th>is_continuous</th>
-       </tr>
-       <tr>
-         <th>split_value</th>
-       </tr>
-       <tr>
-         <th>tid</th>
-       </tr>
-       <tr>
-         <th>dp_ids</th>
-         </tr>
-       </table>
-     </dd>
-     <dt>validation_table_name</dt>
-     <dd>
-      The name of the table/view that contains the validation set used for
-      tree pruning. The default is NULL, in which case we will not do tree
-      pruning.
-    </dd>
-    <dt>continuous_feature_names</dt>
-    <dd>
-      A comma-separated list of the names of features whose values are continuous. The default is null, which means there are no continuous features in the training table.
-    </dd>
-    <dt>feature_col_names</dt>
-    <dd>A comma-separated list of the names of table columns, each of which defines a feature. The default value is null, which means all the columns in the training table, except columns named ‘id’ and ‘class’, will be used as features.
-    </dd>
-    <dt>id_col_name</dt>
-    <dd>The name of the column containing an ID for each record.</dd>
-    <dt>class_col_name</dt>
-    <dd>The name of the column containing the labeled class. </dd>
-    <dt>confidence_level</dt>
-    <dd>A statistical confidence interval of the  resubstitution error.</dd> 
-    <dt>how2handle_missing_value</dt> 
-    <dd>The way to handle missing value. The valid value is 'explicit' or 'ignore'.</dd>
-    <dt>max_tree_depth</dt>
-    <dd>Specifies the maximum number of levels in the result DT to avoid overgrown DTs. </dd>
-    <dt>node_prune_threshold</dt>
-    <dd>The minimum percentage of the number of records required in a
-    child node. It can't be NULL. The range of it is in [0.0, 1.0].
-    This threshold only applies to the non-root nodes. Therefore,
-    if its value is 1, then the trained tree only has one node (the root node);
-    if its value is 0, then no nodes will be pruned by this parameter.</dd>
-    <dt>node_split_threshold</dt>
-    <dd>The minimum percentage of the number of records required in a node in order for a further split to be possible. It can't be NULL. The range of it is in [0.0, 1.0]. If its value is 1, then the trained tree only has two levels, since only the root node can grow; if its value is 0, then trees can grow extensively.</dd>
-  <dt>verbosity</dt>
-  <dd>An integer greater than 0 means this function runs in verbose mode.</dd>
-</dl>
-
-@anchor classify
-@par Classification Function
-
-The classification function uses the learned model stored by the training function to create the classification results.
-<pre class="syntax">
-c45_classify( tree_table_name,
-              classification_table_name,
-              result_table_name
-            )
-</pre>
-\b Arguments
-<dl class="arglist">
-  <dt>tree_table_name</dt>
-  <dd>The name of the table containing the trained model.
-
-    The data to classify is expected to be in the same form as the training data, except that it does not need a class column. </dd>
-  <dt>classification_table_name</dt>
-  <dd>The name of the table containing the data to classify.</dd>
-  <dt>result_table_name</dt>
-  <dd>The name of the output table.</dd>
-</dl>
-
-@anchor score
-@par Scoring Function
-The scoring function scores the learned model against a validation data set.
-<pre class="syntax">
-c45_score( tree_table_name,
-           validation_table_name,
-           verbosity
-         );
-</pre>
-
-This gives a ratio of correctly classified items in the validation set.
-
-@anchor display
-@par Display Tree Function
-
-The display tree function displays the learned model in a human-readable format.
-
-<pre class="syntax">
-c45_display( tree_table_name
-           );
-</pre>
-
-@anchor clean
-@par Clean Tree Function
-
-The clean tree function cleans up the learned model and all metadata.
-<pre class="syntax">
-  c45_clean( tree_table_name
-           );
-</pre>
-
-@anchor notes
-@par Implementation Notes
-
-Due to some implementation difference, decision tree on HAWQ is much
-slower than on Greenplum database when running on small data
-sets. However, for larger data sets, the performance difference is
-much smaller. For example, in a test with 0.75 million rows of data,
-decision tree on HAWQ is only one time slower than on GPDB. This is
-because the overhead due to the different implementation is
-proportional to the tree size, and is usually negligible as data size
-increases (The tree size is not likely to increase proportionally with
-the data size. For example, if a 10-node tree is used to fit a data
-set with 1000 rows, it is very unlikely to fit another data set with 1
-million rows with a 10000-node tree).
-
-@anchor examples
-@examp
-
--# Prepare an input table.
-<pre class="example">
-SELECT * FROM golf_data ORDER BY id;
-</pre>
-Result:
-<pre class="result">
- id | outlook  | temperature | humidity | windy  |    class
-&nbsp;---+----------+-------------+----------+--------+--------------
-  1 | sunny    |          85 |       85 |  false |  Do not Play
-  2 | sunny    |          80 |       90 |  true  |  Do not Play
-  3 | overcast |          83 |       78 |  false |  Play
-  4 | rain     |          70 |       96 |  false |  Play
-  5 | rain     |          68 |       80 |  false |  Play
-  6 | rain     |          65 |       70 |  true  |  Do not Play
-  7 | overcast |          64 |       65 |  true  |  Play
-  8 | sunny    |          72 |       95 |  false |  Do not Play
-  9 | sunny    |          69 |       70 |  false |  Play
- 10 | rain     |          75 |       80 |  false |  Play
- 11 | sunny    |          75 |       70 |  true  |  Play
- 12 | overcast |          72 |       90 |  true  |  Play
- 13 | overcast |          81 |       75 |  false |  Play
- 14 | rain     |          71 |       80 |  true  |  Do not Play
-(14 rows)
-</pre>
-
--# Train the decision tree model. Run the c45_clean() function first to clean up any model and metadata from previous executions.
-<pre class="example">
-SELECT * FROM madlib.c45_clean( 'trained_tree_infogain'
-                              );
-SELECT * FROM madlib.c45_train( 'infogain',
-                                'golf_data',
-                                'trained_tree_infogain',
-                                null,
-                                'temperature,humidity',
-                                'outlook,temperature,humidity,windy',
-                                'id',
-                                'class',
-                                100,
-                                'explicit',
-                                5,
-                                0.001,
-                                0.001,
-                                0
-                              );
-</pre>
-Result:
-<pre class="result">
- training_set_size | tree_nodes | tree_depth |  training_time  | split_criterion
-&nbsp;------------------+------------+------------+-----------------+-----------------
-                14 |          8 |          3 | 00:00:00.871805 | infogain
-(1 row)
-</pre>
-
--# View the tree model table.
-<pre class="example">
-SELECT * FROM trained_tree_infogain ORDER BY id;
-</pre>
-Result:
-<pre class="result">
- id | tree_location | feature |    probability    | ebp_coeff | maxclass |       scv         | live |sample_size | parent_id | lmc_nid | lmc_fval | is_continuous   | split_value
-&nbsp;---+---------------+---------+-------------------+-----------+----------+-------------------+------+----------+-----------+---------+----------+-----------------+-------------
-  1 | {0}           |       3 | 0.642857142857143 |         1 |        2 | 0.171033941880327 |    0 |       14 |         0 |       2 |        1 | f               |
-  2 | {0,1}         |       4 |                 1 |         1 |        2 |                 0 |    0 |        4 |         1 |         |          | f               |
-  3 | {0,2}         |       4 |               0.6 |         1 |        2 | 0.673011667009257 |    0 |        5 |         1 |       5 |        1 | f               |
-  4 | {0,3}         |       2 |               0.6 |         1 |        1 | 0.673011667009257 |    0 |        5 |         1 |       7 |        1 | t               |          70
-  5 | {0,2,1}       |       4 |                 1 |         1 |        2 |                 0 |    0 |        3 |         3 |         |          | f               |
-  6 | {0,2,2}       |       4 |                 1 |         1 |        1 |                 0 |    0 |        2 |         3 |         |          | f               |
-  7 | {0,3,1}       |       4 |                 1 |         1 |        2 |                 0 |    0 |        2 |         4 |         |          | f               |
-  8 | {0,3,2}       |       4 |                 1 |         1 |        1 |                 0 |    0 |        3 |         4 |         |          | f               |
-(8 rows)
-</pre>
-
--# Display the tree with a human readable format:
-<pre class="example">
-SELECT madlib.c45_display('trained_tree_infogain');
-</pre>
-Result:
-<pre class="result">
-                                      c45_display
-&nbsp;--------------------------------------------------------------------------------------
-Tree 1
-    Root Node  : class(  Play)   num_elements(14)  predict_prob(0.642857142857143)
-         outlook:  = overcast : class( Play)   num_elements(4)  predict_prob(1)
-         outlook:  = rain : class( Play)   num_elements(5)  predict_prob(0.6)
-             windy:  =  false : class( Play)   num_elements(3)  predict_prob(1)
-             windy:  = true  : class(  Do not Play)   num_elements(2)  predict_prob(1)
-         outlook:  =  sunny      : class(  Do not Play)   num_elements(5)  predict_prob(0.6)
-             humidity:  <= 70 : class( Play)   num_elements(2)  predict_prob(1)
-             humidity:  > 70  : class(  Do not Play)   num_elements(3)  predict_prob(1)
-(1 row)
-</pre>
-
--# Classify some data with the learned model.
-<pre class="example">
-SELECT * FROM madlib.c45_classify ( 'trained_tree_infogain',
-                                    'golf_data',
-                                    'classification_result'
-                                  );
-</pre>
-Result:
-<pre class="result">
- input_set_size |    classification_time
-----------------+-----------------
-             14 | 00:00:00.247713
-(1 row)
-</pre>
-
--# Check the classification results.
-<pre class="example">
-SELECT t.id, t.outlook, t.temperature, t.humidity, t.windy, c.class
-FROM   madlib.classification_result c, golf_data t
-WHERE  t.id=c.id ORDER BY id;
-</pre>
-Result:
-<pre class="result">
- id | outlook  | temperature | humidity | windy  |    class
-&nbsp;---+----------+-------------+----------+--------+--------------
-  1 | sunny    |          85 |       85 |  false |  Do not Play
-  2 | sunny    |          80 |       90 |  true  |  Do not Play
-  3 | overcast |          83 |       78 |  false |  Play
-  4 | rain     |          70 |       96 |  false |  Play
-  5 | rain     |          68 |       80 |  false |  Play
-  6 | rain     |          65 |       70 |  true  |  Do not Play
-  7 | overcast |          64 |       65 |  true  |  Play
-  8 | sunny    |          72 |       95 |  false |  Do not Play
-  9 | sunny    |          69 |       70 |  false |  Play
- 10 | rain     |          75 |       80 |  false |  Play
- 11 | sunny    |          75 |       70 |  true  |  Play
- 12 | overcast |          72 |       90 |  true  |  Play
- 13 | overcast |          81 |       75 |  false |  Play
- 14 | rain     |          71 |       80 |  true  |  Do not Play
-(14 rows)
-</pre>
-
--# Score the data against a validation set.
-<pre class="example">
-SELECT * FROM madlib.c45_score( 'trained_tree_infogain',
-                                'golf_data_validation',
-                                0
-                              );
-</pre>
-Result:
-<pre class="result">
- c45_score
-&nbsp;----------
-      1
-(1 row)
-</pre>
-
--# Clean up the tree and metadata.
-<pre class="example">
-SELECT madlib.c45_clean( 'trained_tree_infogain'
-                       );
-</pre>
-Result:
-<pre class="result">
- c45_clean
-&nbsp;----------
-&nbsp;
-(1 row)
-</pre>
-
-@anchor literature
-@literature
-
-[1] http://en.wikipedia.org/wiki/C4.5_algorithm
-
-@anchor related
-@par Related Topics
-File c45.sql_in documenting the SQL functions.
-*/
-
-/*
- * This structure is used to store the result for the function of c45_train.
- *
- * training_set_size   The number of rows in the training set.
- * tree_nodes          The number of total tree nodes.
- * tree_depth          The depth of the trained tree.
- * training_time       The time consumed during training the tree.
- * split_criterion     The split criterion used to train the tree.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_train_result CASCADE;
-CREATE TYPE MADLIB_SCHEMA.c45_train_result AS
-    (
-    training_set_size        BIGINT,
-    tree_nodes               BIGINT,
-    tree_depth               INT,
-    training_time            INTERVAL,
-    split_criterion          TEXT
-    );
-
-
-/*
- * This structure is used to store the result for the function of c45_classify.
- *
- * input_set_size         The number of rows in the classification set.
- * classification_time    The time consumed during classifying the tree.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.c45_classify_result CASCADE;
-CREATE TYPE MADLIB_SCHEMA.c45_classify_result AS
-    (
-    input_set_size        BIGINT,
-    classification_time   INTERVAL
-    );
-
-/**
- * @brief This is the long form API of training tree with all specified parameters.
- *
- * @param split_criterion           The name of the split criterion that should be used
- *                                  for tree construction. The valid values are
- *                                  ‘infogain’, ‘gainratio’, and ‘gini’. It can't be NULL.
- *                                  Information gain(infogain) and gini index(gini) are biased
- *                                  toward multivalued attributes. Gain ratio(gainratio) adjusts
- *                                  for this bias. However, it tends to prefer unbalanced splits
- *                                  in which one partition is much smaller than the others.
- * @param training_table_name       The name of the table/view with the source data.
- * @param result_tree_table_name    The name of the table where the resulting DT
- *                                  will be kept.
- * @param validation_table_name     The name of the table/view that contains the validation
- *                                  set used for tree pruning. The default is NULL, in which
- *                                  case we will not do tree pruning.
- * @param continuous_feature_names  A comma-separated list of the names of features whose values
- *                                  are continuous. The default is null, which means there are
- *                                  no continuous features in the training table.
- * @param feature_col_names         A comma-separated list of the names of table columns, each of
- *                                  which defines a feature. The default value is null, which means
- *                                  all the columns in the training table, except columns named
- *                                   ‘id’ and ‘class’, will be used as features.
- * @param id_col_name               The name of the column containing an ID for each record.
- * @param class_col_name            The name of the column containing the labeled class.
- * @param confidence_level          A statistical confidence interval of the
- *                                  resubstitution error.
- * @param how2handle_missing_value  The way to handle missing value. The valid value
- *                                  is 'explicit' or 'ignore'.
- * @param max_tree_depth            Specifies the maximum number of levels in the result DT
- *                                  to avoid overgrown DTs.
- * @param node_prune_threshold      The minimum percentage of the number of records required in a
- *                                  child node. It can't be NULL. The range of it is in [0.0, 1.0].
- *                                  This threshold only applies to the non-root nodes. Therefore,
- *                                  if its value is 1, then the trained tree only has one node (the root node);
- *                                  if its value is 0, then no nodes will be pruned by this parameter.
- * @param node_split_threshold      The minimum percentage of the number of records required in a
- *                                  node in order for a further split to be possible.
- *                                  It can't be NULL. The range of it is in [0.0, 1.0].
- *                                  If its value is 1, then the trained tree only has two levels, since
- *                                  only the root node can grow; if its value is 0, then trees can grow
- *                                  extensively.
- * @param verbosity                 > 0 means this function runs in verbose mode.
- *
- * @return An c45_train_result object.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
-    (
-    split_criterion             TEXT,
-    training_table_name         TEXT,
-    result_tree_table_name      TEXT,
-    validation_table_name       TEXT,
-    continuous_feature_names    TEXT,
-    feature_col_names           TEXT,
-    id_col_name                 TEXT,
-    class_col_name              TEXT,
-    confidence_level            FLOAT,
-    how2handle_missing_value    TEXT,
-    max_tree_depth              INT,
-    node_prune_threshold        FLOAT,
-    node_split_threshold        FLOAT,
-    verbosity                   INT
-    )
-RETURNS MADLIB_SCHEMA.c45_train_result AS $$
-DECLARE
-    begin_func_exec                 TIMESTAMP;
-    tree_table_name                 TEXT;
-    ret                             MADLIB_SCHEMA.c45_train_result;
-    train_rs                        RECORD;
-    old_optimizer                   TEXT;
-BEGIN
-    m4_changequote(`<!', `!>')
-    m4_ifdef(<!__HAWQ__!>, <!
-    EXECUTE 'SHOW optimizer' into old_optimizer;
-    EXECUTE 'SET optimizer = off';
-    !>)
-    m4_changequote(<!`!>, <!'!>)
-
-    begin_func_exec = clock_timestamp();
-
-    IF (verbosity < 1) THEN
-        -- get rid of the messages whose severity level is lower than 'WARNING'
-        SET client_min_messages = WARNING;
-    END IF;
-
-    PERFORM MADLIB_SCHEMA.__assert
-        (
-            (confidence_level IS NOT NULL)      AND
-            float8ge(confidence_level, 0.001)   AND
-            float8le(confidence_level, 100),
-            'confidence level value must be in range from 0.001 to 100'
-        );
-
-    PERFORM MADLIB_SCHEMA.__assert
-        (
-            validation_table_name IS NULL OR
-            MADLIB_SCHEMA.__table_exists
-                (
-                    validation_table_name
-                ),
-            'the specified validation table' ||
-            '<'                              ||
-            validation_table_name            ||
-            '> does not exist'
-        );
-
-    tree_table_name = btrim(lower(result_tree_table_name), ' ');
-    PERFORM MADLIB_SCHEMA.__check_dt_common_params
-        (
-            split_criterion,
-            training_table_name,
-            tree_table_name,
-            continuous_feature_names,
-            feature_col_names,
-            id_col_name,
-            class_col_name,
-            how2handle_missing_value,
-            max_tree_depth,
-            node_prune_threshold,
-            node_split_threshold,
-            verbosity,
-            'tree'
-        );
-
-    train_rs = MADLIB_SCHEMA.__encode_and_train
-        (
-            'C4.5',
-            split_criterion,
-            1,
-            NULL,
-            training_table_name,
-            validation_table_name,
-            tree_table_name,
-            continuous_feature_names,
-            feature_col_names,
-            id_col_name,
-            class_col_name,
-            confidence_level,
-            how2handle_missing_value,
-            max_tree_depth,
-            1.0,
-            'f',
-            node_prune_threshold,
-            node_split_threshold,
-            '<tree_schema_name>_<tree_table_name>',
-            verbosity
-        );
-
-    IF ( verbosity > 0 ) THEN
-            RAISE INFO 'Training Total Time: %',
-                clock_timestamp() - begin_func_exec;
-            RAISE INFO 'training result:%', train_rs;
-    END IF;
-
-    ret.training_set_size   = train_rs.num_of_samples;
-    ret.tree_nodes          = train_rs.num_tree_nodes;
-    ret.tree_depth          = train_rs.max_tree_depth;
-    ret.training_time       = clock_timestamp() - begin_func_exec;
-    ret.split_criterion     = split_criterion;
-
-    m4_changequote(`<!', `!>')
-    m4_ifdef(<!__HAWQ__!>, <!
-    EXECUTE 'SET optimizer = '|| old_optimizer;
-    !>)
-    m4_changequote(<!`!>, <!'!>)
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief C45 train algorithm in short form.
- *
- * @param split_criterion          The name of the split criterion that should be used
- *                                  for tree construction. Possible values are
- *                                  ‘gain’, ‘gainratio’, and ‘gini’.
- * @param training_table_name       The name of the table/view with the source data.
- * @param result_tree_table_name    The name of the table where the resulting DT
- *                                  will be kept.
- * @param validation_table_name     The name of the table/view that contains the validation
- *                                  set used for tree pruning. The default is NULL, in which
- *                                  case we will not do tree pruning.
- * @param continuous_feature_names  A comma-separated list of the names of features whose values
- *                                  are continuous. The default is null, which means there are
- *                                  no continuous features in the training table.
- * @param feature_col_names         A comma-separated list of the names of table columns, each of
- *                                  which defines a feature. The default value is null, which means
- *                                  all the columns in the training table, except columns named
- *                                   ‘id’ and ‘class’, will be used as features.
- * @param id_col_name               The name of the column containing an ID for each record.
- * @param class_col_name            The name of the column containing the labeled class.
- * @param confidence_level          A statistical confidence interval of the
- *                                  resubstitution error.
- * @param how2handle_missing_value  The way to handle missing value. The valid value
- *                                  is 'explicit' or 'ignore'.
- *
- * @return An c45_train_result object.
- *
- * @note
- * This calls the long form of C45 with the following default parameters:
- * - max_tree_depth         := 10
- * - node_prune_threshold   := 0.001
- * - node_split_threshold   := 0.01
- * - verbosity              := 0
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
-    (
-    split_criterion             TEXT,
-    training_table_name         TEXT,
-    result_tree_table_name      TEXT,
-    validation_table_name       TEXT,
-    continuous_feature_names    TEXT,
-    feature_col_names           TEXT,
-    id_col_name                 TEXT,
-    class_col_name              TEXT,
-    confidence_level            FLOAT,
-    how2handle_missing_value    TEXT
-    )
-RETURNS MADLIB_SCHEMA.c45_train_result AS $$
-DECLARE
-    ret       MADLIB_SCHEMA.c45_train_result;
-BEGIN
-    ret = MADLIB_SCHEMA.c45_train
-            (
-                split_criterion,
-                training_table_name,
-                result_tree_table_name,
-                validation_table_name ,
-                continuous_feature_names ,
-                feature_col_names ,
-                id_col_name ,
-                class_col_name ,
-                confidence_level,
-                how2handle_missing_value,
-                10,
-                0.001,
-                0.01,
-                0
-            );
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief C45 train algorithm in short form.
- *
- * @param split_criterion           The name of the split criterion that should be used
- *                                  for tree construction. Possible values are
- *                                  ‘gain’, ‘gainratio’, and ‘gini’.
- * @param training_table_name       The name of the table/view with the source data.
- * @param result_tree_table_name    The name of the table where the resulting DT
- *                                  will be kept.
- *
- * @return An c45_train_result object.
- *
- * @note
- * This calls the above short form of C45 with the following default parameters:
- * - validation_table_name      := NULL
- * - continuous_feature_names   := NULL
- * - id_column_name             := 'id'
- * - class_column_name          := 'class'
- * - confidence_level           := 25
- * - how2handle_missing_value   := 'explicit'
- * - max_tree_deapth            := 10
- * - node_prune_threshold       := 0.001
- * - node_split_threshold       := 0.01
- * - verbosity                  := 0
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_train
-    (
-    split_criterion         TEXT,
-    training_table_name     TEXT,
-    result_tree_table_name  TEXT
-    )
-RETURNS MADLIB_SCHEMA.c45_train_result AS $$
-DECLARE
-    ret MADLIB_SCHEMA.c45_train_result;
-BEGIN
-    ret =  MADLIB_SCHEMA.c45_train
-            (
-                split_criterion,
-                training_table_name,
-                result_tree_table_name,
-                null,
-                null,
-                null,
-                'id',
-                'class',
-                25,
-                'explicit'
-            );
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Display the trained decision tree model with rules.
- *
- * @param   tree_table_name The name of the table containing the tree's information.
- * @param   verbosity       If >= 1 will run in verbose mode.
- *
- * @return The rule representation text for a decision tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
-    (
-    tree_table_name TEXT,
-    verbosity       INT
-    )
-RETURNS SETOF TEXT AS $$
-DECLARE
-    metatable_name          TEXT;
-    classtable_name         TEXT;
-    class_column_name       TEXT;
-    rec                     RECORD;
-    fvalue_stmt             TEXT;
-    feature_rule            TEXT;
-    curstmt                 TEXT;
-    union_stmt              TEXT := NULL;
-    exec_begin              TIMESTAMP;
-    exec_leaves_rule        INTERVAL;
-    exec_internode_rule     INTERVAL;
-    exec_union              INTERVAL;
-BEGIN
-
-    IF (verbosity < 1) THEN
-        -- get rid of the messages whose severity level is lower than 'WARNING'
-        SET client_min_messages = WARNING;
-    END IF;
-
-    PERFORM MADLIB_SCHEMA.__assert
-            (
-                (tree_table_name IS NOT NULL) AND
-                (
-                 MADLIB_SCHEMA.__table_exists
-                    (
-                        tree_table_name
-                    )
-                ),
-                'the specified tree table'      ||
-                coalesce('<'                    ||
-                tree_table_name                 ||
-                '> does not exists', ' is NULL')
-            );
-
-    PERFORM MADLIB_SCHEMA.__assert
-            (
-                verbosity IS NOT NULL,
-                'verbosity must be non-null'
-            );
-
-    IF (verbosity > 0 ) THEN
-        exec_begin          = clock_timestamp();
-        exec_leaves_rule    = exec_begin - exec_begin;
-        exec_union          = exec_leaves_rule;
-        exec_internode_rule = exec_leaves_rule;
-    END IF;
-
-    -- get metatable and classtable name given the tree table name
-    metatable_name = tree_table_name || '_di';
-    classtable_name   = MADLIB_SCHEMA.__get_classtable_name(metatable_name);
-    class_column_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                    'SELECT id, max_class, probability,
-                            num_of_samples, lmc_nid, lmc_fval
-                     FROM %
-                     WHERE id = 1',
-                    ARRAY[
-                        tree_table_name
-                    ]
-                );
-
-    EXECUTE  curstmt INTO rec;
-
-    -- in sample the root node is leaf
-    IF (rec.lmc_nid IS NULL) THEN
-        RETURN NEXT 'All instances will be classified to class '    ||
-                     MADLIB_SCHEMA.__get_class_value
-                        (rec.maxclass, metatable_name)              ||
-                     ' ['                                           ||
-                     (rec.probability * rec.sample_size)::BIGINT       ||
-                     '/'                                            ||
-                     rec.sample_size                                   ||
-                     ']';
-        RETURN;
-    END IF;
-
-    -- get the meta info for features in the tree table (as best split)
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                    'SELECT
-                        id,
-                        column_name,
-                        MADLIB_SCHEMA.__regclass_to_text
-                            (table_oid) as table_name,
-                        is_cont
-                     FROM
-                        % n1
-                     WHERE id IN
-                        (SELECT DISTINCT feature
-                         FROM %
-                         WHERE lmc_nid IS NOT NULL
-                        )',
-                    ARRAY[
-                        metatable_name,
-                        tree_table_name
-                    ]
-                );
-
-    -- put all the features' value together using 'union all'
-    FOR rec IN EXECUTE curstmt LOOP
-        -- continuous feature will produce two rows
-        IF (rec.is_cont) THEN
-            SELECT MADLIB_SCHEMA.__format
-                (
-                'SELECT % as fid, 1 as key,
-                       ''% <= ''::TEXT as fname, null::text as fval
-                 UNION ALL
-                 SELECT % as fid, 2 as key, ''% > ''::TEXT as fname,
-                        null::text as fval',
-                ARRAY[
-                    rec.id::TEXT,
-                    rec.column_name,
-                    rec.id::TEXT,
-                    rec.column_name
-                    ]
-                ) INTO fvalue_stmt;
-
-        -- discrete feature will produce the number of rows
-        -- which is the same with distinct values
-        ELSE
-            SELECT MADLIB_SCHEMA.__format
-                (
-                'SELECT % as fid, key, ''% = ''::TEXT as fname,
-                        MADLIB_SCHEMA.__to_char(%) as fval
-                 FROM %
-                 WHERE key IS NOT NULL',
-                ARRAY[
-                    rec.id::TEXT,
-                    rec.column_name,
-                    rec.column_name,
-                    rec.table_name
-                    ]
-                )
-            INTO fvalue_stmt;
-        END IF;
-
-        IF (union_stmt IS NULL) THEN
-            union_stmt = fvalue_stmt;
-        ELSE
-            union_stmt = union_stmt || ' UNION ALL ' || fvalue_stmt;
-        END IF;
-    END LOOP;
-
-    IF (verbosity > 0 ) THEN
-        exec_union = clock_timestamp() - exec_begin;
-        RAISE INFO 'compose feature values statement time:%', exec_union;
-        RAISE INFO 'feature info stmt: %', curstmt;
-        RAISE INFO 'feature value stmt: %', union_stmt;
-    END IF;
-
-    -- put the rules for leaves into a temp table
-    DROP TABLE IF EXISTS c45_gen_rules_leaves;
-    SELECT MADLIB_SCHEMA.__format
-        (
-        'CREATE TEMP TABLE c45_gen_rules_leaves as
-         SELECT
-            id,
-                ''  then class ''                   ||
-                class::TEXT                         ||
-                '' [''                              ||
-                (probability * num_of_samples)::BIGINT    ||
-                ''/''                               ||
-                num_of_samples                      ||
-                '']''
-            as str,
-            array_to_string(tree_location, '''') as location,
-            1 as rlid
-         FROM
-            (SELECT id, max_class, tree_location, probability, num_of_samples
-             FROM %
-             WHERE lmc_nid IS NULL
-            ) n1
-            LEFT JOIN
-            (SELECT fval as class, code
-             FROM %
-             WHERE code IS NOT NULL
-            ) n2
-            ON n1.max_class = n2.code
-         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (location)')',
-        ARRAY[
-            tree_table_name,
-            classtable_name
-            ]
-        )
-    INTO curstmt;
-
-    EXECUTE curstmt;
-
-    IF (verbosity > 0 ) THEN
-        exec_leaves_rule = clock_timestamp() - exec_begin;
-        RAISE INFO 'create table for leaves'' rules time:%',
-            exec_leaves_rule - exec_union;
-        RAISE INFO 'create tablefor leaves stmt: %', curstmt;
-    END IF;
-
-    DROP TABLE IF EXISTS c45_gen_rules_internode;
-    -- put rules of the internal nodes into a table
-    SELECT MADLIB_SCHEMA.__format
-        (
-            'CREATE TEMP TABLE c45_gen_rules_internode AS
-            SELECT
-                lmc_nid + (key - lmc_fval) AS id,
-                CASE WHEN (id = 1) THEN
-                        ''  if ''   ||
-                        fname       ||
-                        COALESCE(split_value::TEXT,
-                                 MADLIB_SCHEMA.__to_char(fval), ''NULL'')
-                     ELSE
-                        ''     ''   ||
-                        fname       ||
-                        COALESCE(split_value::TEXT,
-                                 MADLIB_SCHEMA.__to_char(fval), ''NULL'')
-                END AS str,
-                array_to_string(tree_location, '''') || key AS location,
-                0  AS rlid
-            FROM
-                (SELECT id, feature, tree_location,
-                        lmc_nid, lmc_fval, split_value
-                 FROM %
-                 WHERE lmc_nid IS NOT NULL
-                ) n1
-                LEFT JOIN
-                (%) n2
-                ON n1.feature = n2.fid
-            WHERE
-                (lmc_nid + key - lmc_fval) IN (SELECT id from %)
-            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (location)')',
-            ARRAY[
-                tree_table_name,
-                union_stmt,
-                tree_table_name
-                ]
-        ) INTO curstmt;
-    EXECUTE curstmt;
-
-    IF (verbosity > 0 ) THEN
-        exec_internode_rule = clock_timestamp() - exec_begin;
-        RAISE INFO 'create table for internal nodes'' rules time:%',
-            exec_internode_rule - exec_leaves_rule;
-        RAISE INFO 'create tablefor internal nodes stmt: %', curstmt;
-    END IF;
-
-    FOR rec IN EXECUTE '
-                  SELECT t1.id, t1.rlid, t2.location, t1.str
-                  FROM
-                    c45_gen_rules_internode t1
-                    LEFT JOIN
-                    c45_gen_rules_leaves t2
-                    ON position(t1.location in t2.location) = 1
-                  UNION ALL
-                  SELECT id, rlid, location, str
-                  FROM  c45_gen_rules_leaves n
-                  ORDER BY location, rlid, id'
-                  LOOP
-        RETURN NEXT rec.str;
-    END LOOP;
-
-    IF (verbosity > 0 ) THEN
-        RAISE INFO 'Total rules generation time:%',
-            clock_timestamp() - exec_begin;
-    END IF;
-
-    RETURN;
-END $$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Display the trained decision tree model with rules.
- *
- * @param tree_table_name  The name of the table containing the tree's information.
- *
- * @return The rule representation text for a decision tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_genrule
-    (
-    tree_table_name TEXT
-    )
-RETURNS SETOF TEXT AS $$
-DECLARE
-    str             TEXT;
-BEGIN
-    -- run in non-verbose mode
-    FOR str IN EXECUTE
-                'SELECT *
-                 FROM MADLIB_SCHEMA.c45_genrule
-                 (' || coalesce('''' || tree_table_name || '''', 'NULL') || ', 0)'
-    LOOP
-        RETURN NEXT str;
-    END LOOP;
-
-    RETURN;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Display the trained decision tree model with human readable format.
- *
- * @param tree_table The name of the table containing the tree's information.
- * @param max_depth  The max depth to be displayed. If null, this function
- *                   will show all levels.
- *
- * @return The text representing the tree with human readable format.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
-    (
-    tree_table  TEXT,
-    max_depth   INT
-    )
-RETURNS SETOF TEXT AS $$
-DECLARE
-    tids    INT[] := ARRAY[1];
-    str     TEXT;
-BEGIN
-    -- get rid of the messages whose severity level is lower than 'WARNING'
-    SET client_min_messages = WARNING;
-
-    PERFORM MADLIB_SCHEMA.__assert
-            (
-                (tree_table IS NOT NULL) AND
-                (
-                 MADLIB_SCHEMA.__table_exists
-                    (
-                        tree_table
-                    )
-                ),
-                'the specified tree table'      ||
-                coalesce('<'                    ||
-                tree_table                      ||
-                '> does not exists', ' is NULL')
-            );
-
-    FOR str IN SELECT * FROM
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAS_ORDERED_AGGREGATES__!>, <!
-         MADLIB_SCHEMA.__treemodel_display_with_ordered_aggr
-                                (tree_table,tids,max_depth) LOOP
-!>, <!
-         MADLIB_SCHEMA.__treemodel_display_no_ordered_aggr
-                                (tree_table,tids,max_depth) LOOP
-!>)
-m4_changequote(<!`!>, <!'!>)
-        RETURN NEXT str;
-    END LOOP;
-    RETURN;
-END $$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Display the whole trained decision tree model with human readable format.
- *
- * @param tree_table: The name of the table containing the tree's information.
- *
- * @return The text representing the tree with human readable format.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_display
-    (
-    tree_table  TEXT
-    )
-RETURNS SETOF TEXT AS $$
-DECLARE
-    str     TEXT;
-BEGIN
-    FOR str IN SELECT * FROM MADLIB_SCHEMA.c45_display(tree_table,NULL) LOOP
-        RETURN NEXT str;
-    END LOOP;
-    RETURN;
-END $$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Classify dataset using trained decision tree model.
- * The classification result will be stored in the table which is defined 
- * as: 
- *
- *  CREATE TABLE classification_result
- *  (
- *     id        INT|BIGINT,
- *     class     SUPPORTED_DATA_TYPE,
- *     prob      FLOAT
- *  );
- *
- * @param tree_table_name           The name of trained tree.
- * @param classification_table_name The name of the table/view with the source data.
- * @param result_table_name         The name of result table.
- * @param verbosity                 > 0 means this function runs in verbose mode.
- *
- * @return A c45_classify_result object.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
-    (
-    tree_table_name             TEXT,
-    classification_table_name   TEXT,
-    result_table_name           TEXT,
-    verbosity                   INT
-    )
-RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
-DECLARE
-    encoded_table_name  TEXT := '';
-    begin_time          TIMESTAMP;
-    ret                 MADLIB_SCHEMA.c45_classify_result;
-    temp_result_table   TEXT := '';
-    metatable_name      TEXT;
-    result_rec          RECORD;
-    curstmt             TEXT;
-    table_names         TEXT[];
-    swap_table          TEXT;
-BEGIN
-    IF (verbosity < 1) THEN
-        -- get rid of the messages whose severity level is lower than 'WARNING'
-        SET client_min_messages = WARNING;
-    END IF;
-
-    begin_time = clock_timestamp();
-
-    PERFORM MADLIB_SCHEMA.__assert
-            (
-                (result_table_name IS NOT NULL) AND
-                (
-                 NOT MADLIB_SCHEMA.__table_exists
-                    (
-                        result_table_name
-                    )
-                ),
-                'the specified result table' || coalesce('<' || result_table_name || '> exists', ' is NULL')
-            );
-
-    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
-                  (
-                    classification_table_name,
-                    tree_table_name,
-                    verbosity
-                  );
-
-    encoded_table_name = table_names[1];
-    temp_result_table  = table_names[2];
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-    SELECT MADLIB_SCHEMA.__unique_string() INTO swap_table;
-    EXECUTE '
-        DROP TABLE IF EXISTS ' || swap_table || ';
-        CREATE TEMP TABLE ' || swap_table || ' AS
-        SELECT * FROM ' || temp_result_table || '
-        WHERE tid = 1';
-    EXECUTE '
-        DROP TABLE IF EXISTS ' || temp_result_table || ';
-        ALTER TABLE ' || swap_table || ' RENAME TO ' || temp_result_table;
-!>, <!
-    EXECUTE 'DELETE FROM '||temp_result_table||' WHERE tid <> 1;';
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-    metatable_name = tree_table_name || '_di';
-
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                    'SELECT
-                        column_name,
-                        table_name
-                     FROM %
-                     WHERE column_type=''c'' LIMIT 1',
-                    ARRAY[
-                        metatable_name
-                    ]
-                );
-
-    EXECUTE curstmt INTO result_rec;
-
-   -- translate the encoded class information back
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                    'CREATE TABLE % AS SELECT n.id, m.fval as class, n.prob
-                     From % n, % m
-                     WHERE n.class = m.code
-                     m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)');',
-                    ARRAY[
-                        result_table_name,
-                        temp_result_table,
-                        result_rec.table_name
-                    ]
-                );
-    EXECUTE curstmt;
-
-    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ';';
-    EXECUTE 'DROP TABLE IF EXISTS ' || temp_result_table || ';';
-    EXECUTE 'SELECT COUNT(*) FROM ' ||classification_table_name||';'
-            INTO ret.input_set_size;
-
-    ret.classification_time = clock_timestamp() - begin_time;
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Classify dataset using trained decision tree model. It runs in quiet
- *        mode. The classification result will be stored in the table which is
- *        defined as:
- *
- *          CREATE TABLE classification_result
- *          (
- *              id        INT|BIGINT,
- *              class     SUPPORTED_DATA_TYPE,
- *              prob      FLOAT
- *          );
- *
- * @param tree_table_name           The name of trained tree.
- * @param classification_table_name The name of the table/view with the source data.
- * @param result_table_name         The name of result table.
- *
- * @return A c45_classify_result object.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_classify
-    (
-    tree_table_name             TEXT,
-    classification_table_name   TEXT,
-    result_table_name           TEXT
-    )
-RETURNS MADLIB_SCHEMA.c45_classify_result AS $$
-DECLARE
-    ret MADLIB_SCHEMA.c45_classify_result;
-BEGIN
-    -- get rid of the messages whose severity level is lower than 'WARNING'
-    SET client_min_messages = WARNING;
-
-    ret = MADLIB_SCHEMA.c45_classify
-           (
-               tree_table_name,
-               classification_table_name,
-               result_table_name,
-               0
-           );
-
-    RETURN ret;
-END $$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Check the accuracy of the decision tree model.
- *
- * @param tree_table_name     The name of the trained tree.
- * @param scoring_table_name  The name of the table/view with the source data.
- * @param verbosity           > 0 means this function runs in verbose mode.
- *
- * @return The estimated accuracy information.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
-    (
-    tree_table_name             TEXT,
-    scoring_table_name          TEXT,
-    verbosity                   INT
-    )
-RETURNS FLOAT8 AS $$
-DECLARE
-    accuracy        FLOAT8;
-BEGIN
-    accuracy = MADLIB_SCHEMA.__treemodel_score
-                (
-                    tree_table_name,
-                    scoring_table_name,
-                    verbosity
-                );
-    RETURN accuracy;
-END;
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Check the accuracy of the decision tree model.
- *
- * @param tree_table_name     The name of the trained tree.
- * @param scoring_table_name  The name of the table/view with the source data.
- *
- * @return The estimated accuracy information.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_score
-    (
-    tree_table_name             TEXT,
-    scoring_table_name          TEXT
-    )
-RETURNS FLOAT8 AS $$
-DECLARE
-    accuracy        FLOAT8;
-BEGIN
-    accuracy = MADLIB_SCHEMA.__treemodel_score
-                (
-                    tree_table_name,
-                    scoring_table_name,
-                    0
-                );
-    RETURN accuracy;
-END;
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/**
- * @brief Cleanup the trained tree table and any relevant tables.
- *
- * @param result_tree_table_name The name of the table containing
- *                               the tree's information.
- *
- * @return The status of that cleanup operation.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.c45_clean
-    (
-    result_tree_table_name TEXT
-    )
-RETURNS BOOLEAN AS $$
-DECLARE
-    result BOOLEAN;
-BEGIN
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name || '_di';
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name || '_ed';
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name || '_ed_col';
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name || '_ed_class';
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name || '_summary';
-    EXECUTE 'DROP TABLE IF EXISTS ' || result_tree_table_name;
-    DROP TABLE IF EXISTS classified_instance_ping;
-    DROP TABLE IF EXISTS classified_instance_pong;
-
-    RETURN TRUE;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');