You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/04/01 03:21:28 UTC
[09/11] incubator-madlib git commit: Build: Add support for HAWQ 2.0

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/96f9ac04/methods/cart/src/pg_gp/dt.sql_in
----------------------------------------------------------------------
diff --git a/methods/cart/src/pg_gp/dt.sql_in b/methods/cart/src/pg_gp/dt.sql_in
deleted file mode 100644
index b77fa67..0000000
--- a/methods/cart/src/pg_gp/dt.sql_in
+++ /dev/null
@@ -1,4299 +0,0 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * @file dt.sql_in
- *
- * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
- * @date April 5, 2012
- *
- *//* ----------------------------------------------------------------------- */
-
-m4_include(`SQLCommon.m4')
-
-m4_ifelse(
-    m4_eval(
-        m4_ifdef(`__GREENPLUM__', 1, 0) &&
-        __DBMS_VERSION_MAJOR__ * 10000 +
-            __DBMS_VERSION_MINOR__ * 100 +
-            __DBMS_VERSION_PATCH__ >= 40201
-    ), 1,
-    `m4_define(`__GREENPLUM_GE_4_2_1__')'
-)
-
-/*
- * @brief Test if the given table is a valid encoded one.
- *        A valid encoded table has the following characteristic:
- *            + It has 5 columns, whose names are id, fid, fval,
- *              is_cont and class.
- *            + The types of the 5 columns are BIGINT, INT, FLOAT8
- *              BOOL and INT.
- *
- * @param enc_tbl_name    The full name of the encoded table.
- *
- * @return Ture if the given table is a valid encoded one.
- *         False if it's an invalid encoded table.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
-    (
-    enc_tbl_name TEXT
-    )
-RETURNS BOOL AS $$
-DECLARE
-    num_enc_table  INT;
-    num_cols       INT;
-    ret            BOOL := 'f'::BOOL;
-BEGIN
-    -- test if the name and the type of a column are valid or not
-    SELECT count(*)
-    FROM pg_attribute
-    WHERE attrelid= enc_tbl_name::regclass::oid AND
-          attnum > 0 AND
-          not attisdropped AND
-          attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
-          atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
-                       'bool'::regtype, 'int'::regtype)
-    INTO num_cols;
-
-    IF (num_cols = 5) THEN
-        ret = 't'::BOOL;
-    END IF;
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
-    (
-    result      FLOAT8[],    -- intermediate result
-    scv         FLOAT8[],
-    fid         INT,
-    split_value FLOAT8
-    )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
-    (
-    sfunc1_result     FLOAT8[],
-    sfunc2_result     FLOAT8[]
-    )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
-    (
-    FLOAT8[],       -- scv
-    INT,            -- fid
-    FLOAT8          -- split_value
-    ) CASCADE;
-CREATE
-AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
-    (
-    FLOAT8[],       -- scv
-    INT,            -- fid
-    FLOAT8          -- split_value
-    )
-(
-  SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
-  m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
-  STYPE=FLOAT8[],
-  initcond = '{0, 0, 0, 0, 0, 0, 0}'
-);
-
-
-/*
- * @brief The step function is defined to process each record in the ACS set.
- *        The records have this format:
- *        {fid, fval, is_cont, split_value, le, total, tid, nid}
- *
- * @param result            The array used to keep the best attribute's info.
- * @param sc_code           The code of the split criterion.
- * @param is_cont           True  - The feature is continuous.
- *                          False - The feature is discrete.
- * @param num_class         The total number of classes.
- * @param le_array          The le component of the ACS record. le_array[i] is the
- *                          number of samples whose class code equals to i and
- *                          whose fval is less-than or equal to the fval component
- *                          of the ACS record being processed.
- * @param total_array       The total component of the ACS record. total_array[i] is
- *                          the number of samples whose class code equals to i.
- * @param true_total        The real total number of samples currently assigned to
- *                          the node identified by (tid, nid). If there are missing
- *                          values in fval, the sum of all elements in total_array
- *                          will be less than true_total.
- *
- * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
- *         in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
-    (
-    result          FLOAT8[],
-    sc_code         INT,
-    is_cont         BOOLEAN,
-    num_class       INT,
-    le_array        FLOAT8[],
-    total_array     FLOAT8[],
-    true_total      BIGINT
-    )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The pre-function for the aggregation of splitting criteria values. It
- *        takes the state array produced by two sfunc and combine them together.
- *
- * @param sfunc1_result     The array from sfunc1.
- * @param sfunc2_result     The array from sfunc2.
- *
- * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
- *         in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
-    (
-    sfunc1_result     FLOAT8[],
-    sfunc2_result     FLOAT8[]
-    )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function for the aggregation of splitting criteria values.
- *        It takes the state array produced by the sfunc and produces a
- *        5-element array.
- *
- * @param internal_result   The 9-element array produced by dt_scv_aggr_prefunc
- *
- * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
- *         in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
-    (
-    internal_result     FLOAT8[]
-    )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
-    (
-    INT,        -- sc
-    BOOLEAN,    -- is_cont
-    INT,        -- total number of classes
-    FLOAT8[],   -- le array
-    FLOAT8[],   -- total count array
-    BIGINT      -- the total number of samples
-    ) CASCADE;
-CREATE
-AGGREGATE MADLIB_SCHEMA.__scv_aggr
-    (
-    INT,        -- sc
-    BOOLEAN,    -- is_cont
-    INT,        -- total number of classes
-    FLOAT8[],   -- le array
-    FLOAT8[],   -- total count array
-    BIGINT      -- the total number of samples
-    )
-(
-  SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
-  m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
-  FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
-  STYPE=FLOAT8[],
-  initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
-  -- 1   sc: 1 infogain, 2 gainratio, 3 gini
-  -- 2   is_cont
-  -- 3   scv_class_info
-  -- 4   scv_attr_info
-  -- 5   scv_class_attr_info
-  -- 6   scv_count
-  -- 7   scv_total
-  -- 8   max_class_id
-  -- 9   max_class_count
-);
-
-
-/*
- * @brief Retrieve the specified number of unique features for a node.
- *        Discrete features used by ancestor nodes will be excluded.
- *        If the number of remaining features is less or equal than the
- *        requested number of features, then all the remaining features
- *        will be returned. Otherwise, we will sample the requested
- *        number of features from the remaining features.
- *
- * @param num_req_features  The number of requested features.
- * @param num_features      The total number of features.
- * @param nid               The ID of the node for which the
- *                          features are sampled.
- * @param dp_fids           The IDs of the discrete features
- *                          used by the ancestors.
- *
- * @return An array containing all the IDs of chosen features.
- *
- */
-CREATE OR REPLACE FUNCTION
-MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
-RETURNS INT[]
-AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
-LANGUAGE C VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Retrieve the selected features for a node. We will create a table, named
- *        sf_association, to store the association between selected feature IDs and
- *        node IDs.
- *
- * @param nid_table_name    The full name of the table which contains all the
- *                          node IDs.
- * @param result_table_name The full name of the table which contains the parent
- *                          discrete features for each node.
- * @param num_chosen_fids   The number of feature IDs will be chosen for a node.
- * @param total_num_fids    The total number of feature IDs, total_num_fids
- *                          >= num_chosen_fids.
- *                          If num_chosen_fids < total_num_fids, then we will
- *                          randomly select num_chosen_fids features from all
- *                          the features. Otherwise, we will return all the
- *                          features exception they belong to the parent discrete
- *                          features for a node.
- * @param verbosity         > 0 means this function runs in verbose mode.
- *
- * @return An constant string for the association table name.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
-    (
-    nid_table_name        TEXT,
-    result_table_name     TEXT,
-    num_chosen_fids       INT,
-    total_num_fids        INT,
-    verbosity             INT
-    )
-RETURNS TEXT AS $$
-DECLARE
-    curstmt     TEXT;
-BEGIN
-    -- The sf_association table records which features are used
-    -- for finding the best split for a node.
-    -- It has two columns:
-    --      nid -- The id of a node.
-    --      fid -- The id of a feature.
-    EXECUTE 'TRUNCATE sf_assoc';
-
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                    'INSERT INTO sf_assoc(nid, fid)
-                     SELECT
-                       nid,
-                       unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
-                                nid,dp_ids)) as fid
-                     FROM (SELECT nid, dp_ids
-                           FROM % s1, % s2
-                           WHERE s1.nid = s2.id
-                           GROUP BY nid, dp_ids) t',
-                    ARRAY[
-                        num_chosen_fids::TEXT,
-                        total_num_fids::TEXT,
-                        nid_table_name,
-                        result_table_name
-                        ]
-                );
-
-     IF (verbosity > 0) THEN
-        RAISE INFO 'build sample feature association stmt: %', curstmt;
-     END IF;
-
-     EXECUTE curstmt;
-
-     -- we return an constant string for the association table name
-     return 'sf_assoc';
-
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * This UDT is used to keep the times of generating acc.
- *
- * calc_pre_time   The time of pre-processing.
- * calc_acc_time   The time of calculating acc.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
-(
-    calc_pre_time       INTERVAL,
-    calc_acc_time       INTERVAL
-);
-
-
-/*
- * @brief Generate the ACC for current leaf nodes.
- *
- * @param encoded_table_name    The full name of the encoded table for the
- *                              training table.
- * @param metatable_name        The full name of the metatable contains the
- *                              relevant information of the input table.
- * @param result_table_name     The full name of the training result table.
- * @param num_featrue_try       The number of features will be chosen per node.
- * @param num_classes           Total number of classes in training set.
- * @param verbosity             > 0 means this function runs in verbose mode.
- *
- * @return The time information for generating ACC.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
-    (
-    encoded_table_name      TEXT,
-    metatable_name          TEXT,
-    result_table_name       TEXT,
-    tr_table_name           TEXT,
-    sf_table_name           TEXT,
-    num_featrue_try         INT,
-    num_classes             INT,
-    sampling_needed         BOOLEAN,
-    verbosity               INT
-    )
-RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
-DECLARE
-    curstmt             TEXT := '';
-    num_fids            INT  := 1;
-    begin_calc_acc      TIMESTAMP;
-    begin_calc_pre      TIMESTAMP;
-    ret                 MADLIB_SCHEMA.__gen_acc_time;
-    select_stmt         TEXT;
-BEGIN
-    begin_calc_pre = clock_timestamp();
-
-    -- get the number of features
-    curstmt = MADLIB_SCHEMA.__format
-                (
-                'SELECT COUNT(id)
-                FROM %
-                WHERE column_type = ''f''',
-                metatable_name
-                );
-    EXECUTE curstmt INTO num_fids;
-
-    -- preprocessing time
-    ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
-    begin_calc_acc    = clock_timestamp();
-
-    IF (sampling_needed) THEN
-        PERFORM MADLIB_SCHEMA.__get_features_of_nodes
-            (
-                tr_table_name,
-                result_table_name,
-                num_featrue_try,
-                num_fids,
-                verbosity
-            );
-
-        select_stmt =  MADLIB_SCHEMA.__format
-             (
-                'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
-                        ed.class, sum(weight) as count
-                 FROM % ed, % tr, % sf
-                 WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
-                 GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
-                            ed.is_cont, ed.class',
-               ARRAY[
-                   encoded_table_name,
-                   tr_table_name,
-                   sf_table_name
-               ]
-            );
-    ELSE
-        select_stmt =  MADLIB_SCHEMA.__format
-             (
-                'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
-                        ed.class, sum(weight) as count
-                 FROM % ed, % tr
-                 WHERE ed.id = tr.id
-                 GROUP BY   tr.tid, tr.nid, ed.fid, ed.fval,
-                            ed.is_cont, ed.class',
-               ARRAY[
-                   encoded_table_name,
-                   tr_table_name
-               ]
-            );
-    END IF;
-    DROP TABLE IF EXISTS training_instance_aux;
-    m4_changequote(`<!', `!>')
-    curstmt = MADLIB_SCHEMA.__format
-        (
-            'CREATE TEMP TABLE training_instance_aux AS
-             SELECT tid, nid, fid, fval, is_cont,
-                    MADLIB_SCHEMA.__dt_acc_count_aggr
-                        (%,count::BIGINT,class::INT) AS count
-             FROM
-             (
-                 %
-             ) l
-             GROUP BY tid,nid,fid, fval,is_cont
-             m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid, fval)!>)',
-            ARRAY[
-                num_classes::TEXT,
-                select_stmt
-            ]
-        );
-    m4_changequote(<!`!>, <!'!>)
-
-    IF ( verbosity>0 ) THEN
-        RAISE INFO '%', curstmt;
-    END IF;
-
-    EXECUTE curstmt;
-    ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__rep_type AS
-    (
-    numOfOrgClasses BIGINT[]
-    );
-
-
-/*
- * @brief The step function for aggregating the class counts while doing Reduce
- *        Error Pruning (REP).
- *
- * @param class_count_array     The array used to store the accumulated information.
- *                              [0]: the total number of mis-classified samples.
- *                              [i]: the number of samples belonging to the ith class.
- * @param classified_class      The predicted class based on our trained DT model.
- * @param original_class        The real class value provided in the validation set.
- * @param max_num_of_classes    The total number of distinct class values.
- *
- * @return An updated class count array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
-    (
-    class_count_array       BIGINT[],
-    classified_class        INT,
-    original_class          INT,
-    max_num_of_classes      INT
-    )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Add the corresponding elements of the input arrays
- *        to create a new one.
- *
- * @param 1 arg     The array 1.
- * @param 2 arg     The array 2.
- *
- * @return The new array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
-    (
-    BIGINT[],
-    BIGINT[]
-    )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'bigint_array_add'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function for aggregating the class counts for REP.
- *        It takes the class count array produced by the sfunc and produces a
- *        two-element array. The first element is the ID of the class that has
- *        the maximum number of samples represented by the root node of the subtree
- *        being processed. The second element is the number of reduced
- *        misclassified samples if the leave nodes of the subtree are pruned.
- *
- * @param class_count_data     The array containing all the information for the
- *                             calculation of Reduced-Error pruning.
- *
- * @return A two element array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
-    (
-    class_count_array       BIGINT[]
-    )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
-    (
-    INT,
-    INT,
-    INT
-    );
-CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
-    (
-    INT,
-    INT,
-    INT
-    )
-(
-  SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
-  m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
-  FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
-  STYPE=BIGINT[]
-);
-
-
-/*
- * @brief The step function of the aggregate __array_indexed_agg.
- *
- * @param state         The step state array of the aggregate function.
- * @param elem          The element to be filled into the state array.
- * @param elem_cnt      The number of elements.
- * @param elem_idx      the subscript of "elem" in the state array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
-    (
-    state       float8[],
-    elem        float8,
-    elem_cnt    int8,
-    elem_idx    int8
-    )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The Pre-function of the aggregate __array_indexed_agg.
- *
- * @param arg0  The first state array.
- * @param arg1  The second state array.
- *
- * @return The combined state.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
-    (
-    float8[],
-    float8[]
-    )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function of __array_indexed_agg.
- *
- * @param state  The state array.
- *
- * @return The aggregate result.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
-    (
-    float8[]
-    )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The aggregate is the same with array_agg, which will accumulate
- *        The elements in each group to an array, except that we allow users
- *        provide the subscript for each element. This aggregate will be
- *        invoked as HashAggregate, while array_agg will be called as
- *        GroupAggregate. Therefore, our implementation have better performance
- *        than the array_agg.
- *
- * @param elem     The element to be fed into the returned array of this aggregate.
- * @param elem_cnt The number of elements.
- * @param elem_idx The subscript of the element.
- *
- * @return The aggregated array.
- *
- */
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8);
-CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
-    SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
-    m4_ifdef( `__POSTGRESQL__', `', `PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
-    FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
-    STYPE = float8[]
-);
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
-    (
-    count_array         BIGINT[],
-    num_of_class        INT,
-    count               BIGINT,
-    class               INT
-    )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
-LANGUAGE C VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__dt_acc_count_aggr (INT, BIGINT, INT);
-CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
-    (
-    INT,
-    BIGINT,
-    INT
-    )
-(
-  SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
-  m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
-  STYPE=BIGINT[]
-);
-
-
-/*
- * @brief The aggregate is created for the PostgreSQL, which doesn't support the
- *        function sum over an array.
- *
- * @param elem     The element to be fed into the returned array of this aggregate.
- *
- * @return The array with the sum of all the input array in a group.
- *
- */
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__bigint_array_sum (BIGINT[]);
-CREATE AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
-    (
-    BIGINT[]
-    )
-(
-  SFUNC=MADLIB_SCHEMA.__bigint_array_add,
-  m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
-  STYPE=BIGINT[]
-);
-
-
-/*
- * @brief This function find the best split and return the information.
- *
- * @param table_name          The name of the table containing the training
- *                            set.
- * @param confidence_level    This parameter is used by the 'Error-Based Pruning'.
- *                            Please refer to the paper for detailed definition.
- *                            The paper's name is 'Error-Based Pruning of Decision
- *                            Trees Grown on Very Large Data Sets Can Work!'.
- * @param feature_table_name  Is is the name of one internal table, which contains
- *                            meta data for each feature.
- * @param split_criterion     It defines the split criterion to be used.
- *                            (1- information gain. 2- gain ratio. 3- gini).
- * @param continue_grow       It specifies whether we should still grow the tree
- *                            on the selected branch.
- * @param output_table        It specifies the table used to store the chosen splits.
- * @param h2hmv_routine_id    Specifies how to handle missing values.
- *                            1 ignore, 2 explicit.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
-    (
-    table_name              TEXT,
-    confidence_level        FLOAT,
-    feature_table_name      TEXT,
-    split_criterion         INT,
-    continue_grow           INT,
-    output_table            TEXT,
-    h2hmv_routine_id        INT,
-    num_classes             INT
-    )
-RETURNS VOID AS $$
-DECLARE
-    total_size         INT;
-    curstmt            TEXT := '';
-    begin_func_exec    TIMESTAMP;
-    select_stmt        TEXT;
-BEGIN
-    begin_func_exec = clock_timestamp();
-
-    m4_changequote(`<!', `!>')
-    IF (h2hmv_routine_id=1) THEN
-        -- For ignore, we need the true size of nodes to handle the missing values.
-        select_stmt =
-           'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
-            FROM
-            (
-                SELECT tid, nid, fid,
-                m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total
-                FROM training_instance_aux
-                GROUP BY tid, nid, fid
-            ) t1 INNER JOIN node_size_aux t2
-            ON t1.tid=t2.tid AND t1.nid=t2.nid';
-    ELSE
-        -- For explicit, the calculated node size from the aggregation is correct.
-        -- We can set NULL, which denotes we can safely use the counted value.
-        select_stmt =
-           'SELECT tid, nid, fid,
-            m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total,
-            NULL::BIGINT AS node_size
-            FROM training_instance_aux
-            GROUP BY tid, nid, fid';
-    END IF;
-
-    /*
-     * This table is used to store information for the calculated best split
-     *
-     * tid                  The ID of the tree.
-     * node_id              The ID of one node in the specified tree.
-     * feature              The ID of the selected feature.
-     * probability          The predicted probability of our chosen class.
-     * max_class            The ID of the class chosen by the algorithm.
-     * max_scv              The maximum split criterion value.
-     * live                 1- For the chosen split, we should split further.
-     *                      0- For the chosen split, we shouldn't split further.
-     * ebp_coeff            total error for error-based pruning.
-     * is_cont              whether the selected feature is continuous.
-     * split_value          If the selected feature is continuous, it specifies
-     *                      the split value. Otherwise, it is of no use.
-     * distinct_features    The number of distinct values for the selected feature.
-     * node_size            The size of this tree node.
-     *
-     */
-    EXECUTE 'DROP TABLE IF EXISTS '||output_table;
-    EXECUTE 'CREATE TEMP TABLE '||output_table||'
-    (
-        tid                 INT,
-        node_id             INT,
-        feature             INT,
-        probability         FLOAT,
-        max_class           INTEGER,
-        max_scv             FLOAT,
-        live                INT,
-        ebp_coeff           FLOAT,
-        is_cont             BOOLEAN,
-        split_value         FLOAT,
-        distinct_features   INT,
-        node_size           INT
-    ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (node_id)!>);';
-
-
-    EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
-
-    SELECT MADLIB_SCHEMA.__format
-        (
-        'INSERT INTO %
-         SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
-                CASE WHEN (best_scv[1] < 1e-9   OR
-                           best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
-                        0
-                ELSE
-                        1
-                END AS live,
-                MADLIB_SCHEMA.__ebp_calc_errors
-                    (best_scv[5], best_scv[4], %) AS ebp_coeff,
-                o2.is_cont,
-                CASE WHEN( o2.is_cont ) THEN
-                    best_scv[7]
-                ELSE
-                    NULL
-                END AS split_value,
-                o2.num_dist_value, best_scv[5]
-        FROM
-        (
-            SELECT s1.tid, s1.nid,
-                MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
-                    coalesce(s1.split_value,0)) as best_scv
-            FROM (
-                SELECT t1.tid, t1.nid, t1.fid, split_value,
-                        MADLIB_SCHEMA.__scv_aggr
-                            (%, is_cont, %, le, total, t2.node_size) AS scv
-                FROM
-                    (
-                        SELECT tid, nid, fid, fval, is_cont,
-                        CASE WHEN (is_cont) THEN
-                           fval
-                        ELSE
-                            NULL::FLOAT8
-                        END AS split_value,
-                        CASE WHEN (is_cont) THEN
-                                m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) OVER
-                                    (PARTITION BY tid, nid, fid ORDER BY fval
-                                     ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
-                        ELSE
-                                count
-                        END AS le
-                        FROM training_instance_aux
-                    ) t1,
-                    (
-                        %
-                    ) t2
-                WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
-                GROUP BY t1.tid, t1.nid, t1.fid, split_value
-            ) s1
-            GROUP BY s1.tid, s1.nid
-        ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
-            ARRAY[
-                output_table,
-                continue_grow::TEXT,
-                confidence_level::TEXT,
-                split_criterion::TEXT,
-                num_classes::TEXT,
-                select_stmt,
-                feature_table_name
-            ]
-        ) INTO curstmt;
-
-    EXECUTE curstmt;
-    m4_changequote(<!`!>, <!'!>)
-
-    RETURN;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief For training one decision tree, we need some internal tables
- *        to store intermediate results. This function creates those
- *        tables. Moreover, this function also creates the tree table
- *        specified by user.
- *
- * @param result_tree_table_name  The name of the tree specified by user.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
-    (
-    result_tree_table_name TEXT
-    )
-RETURNS void AS $$
-BEGIN
-    --  The table of node_size_aux records the size of each node. It is used
-    --  for missing value handling.
-    DROP TABLE IF EXISTS node_size_aux CASCADE;
-    CREATE TEMP TABLE node_size_aux
-    (
-        tid             INT,
-        nid             INT,
-        node_size       INT
-    )m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid,nid)');
-
-    -- The table below stores the decision tree information just constructed.
-    -- Columns:
-    --      id:             The ID of the node represented by this row. Tree
-    --                      node IDs are unique across all trees. The IDs of
-    --                      all children of a node is made to be continuous.
-    --      tree_location:  An array containing the encoded values of all the
-    --                      features on the path from the root node to the
-    --                      current node. For the root node, the location
-    --                      value is {0}.
-    --      feature:        The ID of the best split feature chosen for the
-    --                      node represented by this row.
-    --      probability:    If forced to make a call for a dominant class
-    --                      at a given point this would be the confidence of the
-    --                      call (this is only an estimated value).
-    --      ebp_coeff:      The total errors used by error based pruning (ebp)
-    --                      based on the specified confidence level. RF does
-    --                      not do EBP therefore for RF nodes, this column always
-    --                      contains 1.
-    --      max_class:      If forced to make a call for a dominant class
-    --                      at a given point this is the selected class.
-    --      scv:            The splitting criteria value (scv) computed at this node.
-    --      live:           Specifies whether the node should be further split
-    --                      or not. A positive value indicates further split of
-    --                      the node represented by this row is needed.
-    --      num_of_samples: The number of samples at this node.
-    --      parent_id:      Id of the parent branch.
-    --      lmc_nid:        Leftmost child (lmc) node id of the node represented
-    --                      by the current row.
-    --      lmc_fval:       The feature value which leads to the lmc node.
-    --                      An example of getting all the child nodes' ids
-    --                      and condition values
-    --                      1. Get the right most node id
-    --                      SELECT DISTINCT ON(parent_id) id FROM tree_table
-    --                      WHERE parent_id = $pid ORDER BY parent_id, id desc
-    --                      INTO max_child_nid;
-    --                      2. Get child nodes' ids and condition values by a
-    --                         while loop
-    --                      node_count = 1;
-    --                      WHILE (lmc_nid IS NOT NULL) AND
-    --                          (0 < node_count AND lmc_nid <= max_child_nid) LOOP
-    --                          ...
-    --                          lmc_nid  = lmc_nid  + 1;
-    --                          lmc_fval = lmc_fval + 1;
-    --                          SELECT COUNT(id) FROM tree_table
-    --                          WHERE id = $lmc_nid AND parent_id = $pid
-    --                          INTO node_count;
-    --                      END LOOP;
-    --      is_cont:        It specifies whether the selected feature is a
-    --                      continuous feature.
-    --      split_value:    For continuous feature, it specifies the split value.
-    --                      Otherwise, it is of no meaning and fixed to 0.
-    --      tid:            The id of a tree that this node belongs to.
-    --      dp_ids:         An array containing the IDs of the non-continuous
-    --                      features chosen by all ancestors nodes (starting
-    --                      from the root) for splitting.
-    --
-    -- The table below stores the final decision tree information.
-    -- It is an the table specified by users.
-    -- Please refer the table above for detailed column definition.
-    m4_changequote(`<!', `!>')
-    EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
-    EXECUTE 'CREATE TABLE '||result_tree_table_name||'
-    (
-        id              INT,
-        tree_location   INT[],
-        feature         INT,
-        probability     FLOAT,
-        ebp_coeff       FLOAT,
-        max_class       INTEGER,
-        scv             FLOAT,
-        live            INT,
-        num_of_samples  INT,
-        parent_id       INT,
-        lmc_nid         INT,
-        lmc_fval        INT,
-        is_cont         BOOLEAN,
-        split_value     FLOAT,
-        tid             INT,
-        dp_ids          INT[]
-    ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (tid,id)!>);';
-    m4_changequote(<!`!>, <!'!>)
-
-    -- The following table stored the auxiliary information for updating the
-    -- association table, so that the updating operation only need to
-    -- join the encoded table with association table once
-    EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
-    CREATE TEMP TABLE assoc_aux
-    (
-        nid         INT,
-        fid         INT,
-        lmc_id      INT,
-        svalue      FLOAT,
-        is_cont     BOOL
-    ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (nid)');
-
-    EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
-    EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
-    EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
-    CREATE TEMP TABLE tr_assoc_ping
-    (
-        id      BIGINT ENCODING (compresstype=RLE_TYPE),
-        nid     INT    ENCODING (compresstype=RLE_TYPE),
-        tid     INT    ENCODING (compresstype=RLE_TYPE),
-        weight  INT    ENCODING (compresstype=RLE_TYPE)
-    )
-    WITH(appendonly=true, orientation=column)
-    DISTRIBUTED BY(id);
-
-    CREATE TEMP TABLE tr_assoc_pong
-    (
-        id      BIGINT ENCODING (compresstype=RLE_TYPE),
-        nid     INT    ENCODING (compresstype=RLE_TYPE),
-        tid     INT    ENCODING (compresstype=RLE_TYPE),
-        weight  INT    ENCODING (compresstype=RLE_TYPE)
-    )
-    WITH(appendonly=true, orientation=column)
-    DISTRIBUTED BY(id);
-
-    CREATE TEMP TABLE sf_assoc
-    (
-        nid     INT    ENCODING (compresstype=RLE_TYPE),
-        fid     INT    ENCODING (compresstype=RLE_TYPE)
-    )
-    WITH(appendonly=true, orientation=column)
-    DISTRIBUTED BY(fid);
-!>, <!
-    CREATE TEMP TABLE tr_assoc_ping
-    (
-        id      BIGINT,
-        nid     INT,
-        tid     INT,
-        weight  INT
-    )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
-    CREATE TEMP TABLE tr_assoc_pong
-    (
-        id      BIGINT,
-        nid     INT,
-        tid     INT,
-        weight  INT
-    )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
-    CREATE TEMP TABLE sf_assoc
-    (
-        nid     INT,
-        fid     INT
-    )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid)!>);
-!>)
-m4_changequote(<!`!>, <!'!>)
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
- *
- * @param tree_table_name   The name of the table containing the tree.
- * @param validation_table  The name of the table containing validation set.
- * @param max_num_classes   The count of different classes.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
-    (
-    tree_table_name     TEXT,
-    validation_table    TEXT,
-    max_num_classes     INT
-    )
-RETURNS void AS $$
-DECLARE
-    num_parent_ids          INTEGER;
-    cf_table_name           TEXT;
-    encoded_table_name      TEXT;
-    metatable_name          TEXT;
-    curstmt                 TEXT;
-    class_col_name          TEXT;
-    classify_result         TEXT;
-    temp_text               TEXT;
-    n                       INT;
-    table_names             TEXT[];
-    swap_tree_table         TEXT;
-BEGIN
-    metatable_name = tree_table_name || '_di';
-    class_col_name  = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-
-    -- the value of class column in validation table must in the KV table
-    SELECT MADLIB_SCHEMA.__format
-        (
-        'SELECT COUNT(*)
-         FROM %
-         WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
-            (SELECT fval FROM % WHERE fval IS NOT NULL)',
-        ARRAY[
-            validation_table,
-            class_col_name,
-            MADLIB_SCHEMA.__get_classtable_name(metatable_name)
-        ]
-        )
-    INTO curstmt;
-
-    EXECUTE curstmt INTO n;
-
-    PERFORM MADLIB_SCHEMA.__assert
-            (
-                n = 0,
-                'the value of class column in validation table must in
-                 training table'
-            );
-
-    table_names = MADLIB_SCHEMA.__treemodel_classify_internal
-                  (
-                    validation_table,
-                    tree_table_name,
-                    0
-                  );
-
-    encoded_table_name = table_names[1];
-    classify_result    = table_names[2];
-    cf_table_name      = classify_result;
-
-    -- after encoding in classification, class_col_name is fixed to class
-    class_col_name  = 'class';
-
-    m4_changequote(`<!', `!>')
-    m4_ifdef(<!__HAWQ__!>, <!
-        EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
-        EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
-                 classify_result ||
-                 ' LIMIT 0 m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)';
-    !>)
-    m4_changequote(<!`!>, <!'!>)
-
-    LOOP
-        DROP TABLE IF EXISTS selected_parent_ids_rep;
-        CREATE TEMP TABLE selected_parent_ids_rep
-        (
-            parent_id BIGINT,
-            max_class  INT
-        ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)');
-
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'INSERT INTO selected_parent_ids_rep
-                SELECT parent_id, t.g[1] as max_class
-                FROM
-                (
-                    SELECT parent_id,
-                           MADLIB_SCHEMA.__rep_aggr_class_count
-                               (
-                               c.class,
-                               s.%,
-                               %
-                               ) AS g
-                    FROM % c, % s
-                    WHERE c.id=s.id
-                    GROUP BY parent_id
-                ) t
-                WHERE t.g[2] >= 0 AND
-                      t.parent_id IN
-                      (
-                          Select parent_id FROM %
-                          WHERE parent_id NOT IN
-                              (
-                                  Select parent_id
-                                  FROM %
-                                  WHERE lmc_nid IS NOT NULL
-                              ) and id <> 1
-                      );',
-                  ARRAY[
-                      class_col_name,
-                      MADLIB_SCHEMA.__to_char(max_num_classes),
-                      classify_result,
-                      encoded_table_name,
-                      tree_table_name,
-                      tree_table_name
-                  ]
-              )
-              INTO curstmt;
-
-        EXECUTE curstmt;
-
-        EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
-            INTO num_parent_ids;
-        IF (num_parent_ids IS NULL)  THEN
-            EXIT;
-        END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-        -- for some databases, update operation can't distribute data across segments
-        -- we use two tables to update the data
-        IF (classify_result = 'tree_rep_pong') THEN
-            temp_text = cf_table_name;
-        ELSE
-            temp_text =  'tree_rep_pong';
-        END IF;
-
-        EXECUTE 'TRUNCATE ' ||  temp_text;
-        SELECT MADLIB_SCHEMA.__format
-            (
-            'INSERT INTO %(id, class, parent_id, leaf_id)
-             SELECT m.id,  t.max_class, t.parent_id, t.id
-             FROM % m, % t
-             WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
-             m.parent_id = t.id',
-            ARRAY[
-                temp_text,
-                classify_result,
-                tree_table_name
-            ]
-            )
-        INTO curstmt;
-
-        EXECUTE curstmt;
-
-        classify_result = temp_text;
-!>, <!
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'UPDATE % m set class = t.max_class,
-                 parent_id = t.parent_id,leaf_id = t.id
-                 FROM % t
-                 WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
-                 m.parent_id=t.id',
-                classify_result,
-                tree_table_name
-            )
-        INTO curstmt;
-        EXECUTE curstmt;
-!>)
-
-m4_ifdef(<!__HAWQ__!>, <!
-        SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || swap_tree_table || ';
-            CREATE TABLE ' || swap_tree_table || ' AS
-            SELECT * FROM ' || tree_table_name || '
-            WHERE id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)
-            AND parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
-
-        EXECUTE '
-            INSERT INTO ' || swap_tree_table || '
-            SELECT
-                t1.id, t1.tree_location, t1.feature, t1.probability,
-                t1.ebp_coeff, t2.max_class, t1.scv, t1.live, t1.num_of_samples,
-                t1.parent_id, NULL, NULL, t1.is_cont, t1.split_value, t1.tid,
-                t1.dp_ids
-            FROM ' || tree_table_name || ' t1 inner join selected_parent_ids_rep t2
-            ON t1.id = t2.parent_id
-            WHERE t1.parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
-
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || tree_table_name;
-        PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
-!>, <!
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'DELETE FROM % WHERE parent_id IN
-                 (SELECT parent_id FROM selected_parent_ids_rep)',
-                tree_table_name
-            )
-            INTO curstmt;
-
-        EXECUTE curstmt;
-
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'UPDATE % t1 SET lmc_nid = NULL,
-                 lmc_fval = NULL, max_class = t2.max_class
-                 FROM selected_parent_ids_rep t2
-                 WHERE t1.id = t2.parent_id;',
-                tree_table_name
-            )
-            INTO curstmt;
-
-        EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-    END LOOP;
-
-    EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Calculates the total errors used by Error Based Pruning (EBP).
- *
- * @param total             The number of total samples represented by the node
- *                          being processed.
- * @param prob              The probability to mis-classify samples represented by the
- *                          child nodes if they are pruned with EBP.
- * @param confidence_level  A certainty factor to calculate the confidence limits
- *                          for the probability of error using the binomial theorem.
- *
- * @return The computed total error.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
-    (
-    total               FLOAT8,
-    prob                FLOAT8,
-    confidence_level    FLOAT8
-    ) RETURNS FLOAT8
-AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Prune the trained tree with "Error-based Pruning" algorithm.
- *
- * @param tree_table_name  The name of the table containing the tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
-    (
-    tree_table_name TEXT
-    )
-RETURNS void AS $$
-DECLARE
-    num_parent_ids INTEGER;
-    curstmt TEXT;
-    swap_tree_table TEXT;
-BEGIN
-    LOOP
-        DROP TABLE IF EXISTS selected_parent_ids_ebp;
-        CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
-            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(parent_id)');
-
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'INSERT INTO selected_parent_ids_ebp
-                SELECT s.parent_id as parent_id
-                FROM
-                (
-                    Select parent_id, sum(ebp_coeff) as ebp_coeff
-                    FROM
-                    (
-                        Select parent_id, ebp_coeff
-                        FROM %
-                        WHERE parent_id NOT IN
-                            (
-                            Select parent_id  FROM % WHERE lmc_nid IS NOT NULL
-                            )  and id <> 1
-                    ) m
-                    GROUP BY m.parent_id
-                 ) s
-                 LEFT JOIN  % p
-                    ON p.id = s.parent_id
-                 WHERE  p.ebp_coeff < s.ebp_coeff;',
-                 tree_table_name,
-                 tree_table_name,
-                 tree_table_name
-            )
-            INTO curstmt;
-
-        EXECUTE curstmt;
-
-        EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
-                 INTO num_parent_ids;
-
-        IF (num_parent_ids IS NULL)  THEN
-            EXIT;
-        END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-        SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || swap_tree_table || ';
-            CREATE TABLE ' || swap_tree_table || ' AS
-            SELECT * FROM ' || tree_table_name || '
-            WHERE parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)
-                AND id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
-
-        EXECUTE '
-            INSERT INTO ' || swap_tree_table || '
-            SELECT
-                id, tree_location, feature, probability, ebp_coeff, max_class,
-                scv, live, num_of_samples, parent_id, NULL, NULL, is_cont,
-                split_value, tid, dp_ids
-            FROM ' || tree_table_name || '
-            WHERE
-                id IN (SELECT parent_id FROM selected_parent_ids_ebp) AND
-                parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
-
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || tree_table_name;
-        PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
-!>, <!
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'DELETE FROM %
-                WHERE parent_id IN
-                    (SELECT parent_id FROM selected_parent_ids_ebp)',
-                tree_table_name
-            )
-            INTO curstmt;
-
-        EXECUTE curstmt;
-
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'UPDATE %
-                SET lmc_nid = NULL, lmc_fval = NULL
-                WHERE id IN
-                    (SELECT parent_id FROM selected_parent_ids_ebp)',
-                tree_table_name
-            )
-            INTO curstmt;
-
-        EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-    END LOOP;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Generate the final trained tree.
- *
- * @param result_tree_table_name  The name of the table containing the tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
-    (
-    result_tree_table_name TEXT
-    )
-RETURNS void AS $$
-DECLARE
-    tree_size           INTEGER;
-    curstmt             TEXT;
-    num_redundant_nodes INTEGER;
-    swap_tree_table     TEXT;
-BEGIN
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-    SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-    EXECUTE '
-        DROP TABLE IF EXISTS ' || swap_tree_table || ';
-        CREATE TABLE ' || swap_tree_table || ' AS
-        SELECT * FROM ' || result_tree_table_name || '
-        WHERE COALESCE(num_of_samples,0) != 0
-            AND id NOT IN (SELECT parent_id FROM ' || result_tree_table_name || ' GROUP BY parent_id)';
-
-    EXECUTE '
-        INSERT INTO ' || swap_tree_table || '
-        SELECT
-            k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
-            k.scv, k.live, k.num_of_samples, k.parent_id, g.lmc_nid, g.lmc_fval, k.is_cont,
-            k.split_value, k.tid, k.dp_ids
-        FROM ' || result_tree_table_name || ' k INNER JOIN
-        (
-            SELECT parent_id,
-                   min(id) as lmc_nid,
-                   min(tree_location[array_upper(tree_location,1)])
-                   as lmc_fval
-            FROM ' || result_tree_table_name || '
-            GROUP BY parent_id
-        ) g
-        ON k.id = g.parent_id
-        WHERE COALESCE(k.num_of_samples, 0) != 0';
-
-    EXECUTE '
-        DROP TABLE IF EXISTS ' || result_tree_table_name;
-    PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
-    EXECUTE ' DELETE FROM ' || result_tree_table_name ||
-            ' WHERE COALESCE(num_of_samples,0) = 0';
-
-    -- for each node, find the left most child node id and the feature value,
-    -- and update the node's lmc_nid and lmc_fval column
-    SELECT MADLIB_SCHEMA.__format
-            (
-                'UPDATE % k
-                 SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
-                 FROM
-                    (
-                    SELECT parent_id,
-                           min(id) as lmc_nid,
-                           min(tree_location[array_upper(tree_location,1)])
-                           as lmc_fval
-                    FROM %
-                    GROUP BY parent_id
-                    ) g
-                WHERE k.id = g.parent_id',
-                ARRAY[
-                    result_tree_table_name,
-                    result_tree_table_name
-                    ]
-            )
-    INTO curstmt;
-    EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-    /*
-     *  For a certain node, if all of its children are leaf nodes and have the
-     *  same class label, we can safely remove its children. After removal, we
-     *  should apply the same operation to the new leaf nodes until no nodes
-     *  meet this criterion.
-     */
-    LOOP
-        EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
-        -- Find nodes whose children should be removed.
-        curstmt = MADLIB_SCHEMA.__format
-            (
-            'CREATE TEMP TABLE trim_tree_aux_table AS
-            SELECT parent_id FROM
-            (
-                SELECT parent_id, count(distinct max_class) as class_count
-                FROM %
-                WHERE parent_id IN
-                    (
-                    SELECT parent_id FROM %
-                    WHERE parent_id NOT IN
-                        (
-                            SELECT parent_id
-                            FROM %
-                            WHERE lmc_nid IS NOT NULL
-                        ) and parent_id <> 0
-                    )
-                GROUP BY parent_id
-            ) l
-            where l.class_count=1
-            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)')',
-            ARRAY[
-                result_tree_table_name,
-                result_tree_table_name,
-                result_tree_table_name
-                ]
-            );
-        EXECUTE curstmt;
-
-        EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
-            INTO num_redundant_nodes;
-
-        IF (num_redundant_nodes <= 0) THEN
-            EXIT;
-        END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-        SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || swap_tree_table || ';
-            CREATE TABLE ' || swap_tree_table || ' AS
-            SELECT * FROM ' || result_tree_table_name || '
-            WHERE parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)
-                AND id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
-
-        EXECUTE '
-            INSERT INTO ' || swap_tree_table || '
-            SELECT
-                k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
-                k.scv, k.live, k.num_of_samples, k.parent_id, NULL, NULL, k.is_cont,
-                k.split_value, k.tid, k.dp_ids
-            FROM ' || result_tree_table_name || ' k INNER JOIN
-            (
-                SELECT parent_id FROM trim_tree_aux_table
-            ) g
-            ON k.id = g.parent_id
-            WHERE k.parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
-
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || result_tree_table_name;
-        PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
-        -- Delete the found redundant nodes.
-        curstmt = MADLIB_SCHEMA.__format
-            (
-            '
-            DELETE FROM % t
-            WHERE t.parent_id IN
-            (SELECT parent_id FROM trim_tree_aux_table)',
-            ARRAY[
-                result_tree_table_name
-                ]
-            );
-        EXECUTE curstmt;
-
-        -- Set the nodes, whose children are removed, to be leaf nodes.
-        curstmt =  MADLIB_SCHEMA.__format
-                (
-                'UPDATE % k
-                 SET lmc_nid = NULL, lmc_fval = NULL
-                 FROM
-                    (
-                    SELECT parent_id FROM trim_tree_aux_table
-                    ) g
-                 WHERE k.id = g.parent_id',
-                ARRAY[
-                    result_tree_table_name
-                    ]
-                );
-        EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-    END LOOP;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * The UDT for the training result.
- *
- *      num_of_samples           It means how many records there exists in the
- *                               training set.
- *      features_per_node        The number of features chosen for each tree.
- *      num_tree_nodes           The number of tree nodes.
- *      max_tree_depth           The max tree depth.
- *      calc_acc_time            Total time of calculating acc.
- *      calc_pre_time            Time of preprocessing when calculating acc.
- *      update_time              Total time of updating operation after found
- *                               the best time.
- *      update_best              Time of updating the best splits' information.
- *      update_child             Time of generating the child nodes.
- *      update_nid               Time of updating the assigned node IDs.
- *      scv_acs_time             Time of calculating the best splits.
- *      prune_time               Time of tree pruning.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__train_result AS
-(
-    num_of_samples           BIGINT,
-    features_per_node        INT,
-    num_tree_nodes           INT,
-    max_tree_depth           INT,
-    calc_acc_time            INTERVAL,
-    calc_pre_time            INTERVAL,
-    update_time              INTERVAL,
-    update_best              INTERVAL,
-    update_child             INTERVAL,
-    update_nid               INTERVAL,
-    scv_acs_time             INTERVAL,
-    prune_time               INTERVAL
-);
-
-
-/*
- * @brief The function samples a set of integer values between low and high.
- *
- * @param num_of_samples  The number of records to be sampled.
- * @param low             The low limit of sampled values.
- * @param high            The high limit of sampled values.
- *
- * @return A set of integer values sampled randomly between [low, high].
- *
- */
-DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
-    (
-    BIGINT,
-    BIGINT,
-    BIGINT
-    )CASCADE;
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
-    (
-    num_of_samples      BIGINT,
-    low                 BIGINT,
-    high                BIGINT
-    )
-RETURNS SETOF BIGINT
-AS 'MODULE_PATHNAME', 'dt_sample_within_range'
-LANGUAGE C STRICT VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The function samples with replacement from source table and store
- *        the results to target table.
- *
- *        In this function, we firstly calculate how many samples should be
- *        generated in each segment. Then, we let those segments sample with
- *        replacement between the maximum ID and minimum ID of the source table
- *        in parallel and assign samples to different trees.
- *
- *        If there are gaps in the ID column of the source table, we sample
- *        extra records in proportion to the number of gaps. At last, we remove
- *        these invalid samples with an inner join operation with the source
- *        table. Since we target big data, this strategy works quite well.
- *
- * @param num_of_tree     The number of trees to be trained.
- * @param size_per_tree   The number of records to be sampled for each tree.
- * @param src_table       The name of the table to be sampled from.
- * @param target_table    The name of the table used to store the results.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
-    (
-    num_of_tree     INT,
-    size_per_tree   BIGINT,
-    src_table       TEXT,
-    target_table    TEXT
-    )
-RETURNS VOID AS $$
-DECLARE
-    segment_num     INT;
-    sample_per_seg  BIGINT;
-    sample_ratio    FLOAT8;
-    record_num      FLOAT8;
-    min_id          BIGINT;
-    max_id          BIGINT;
-    range           FLOAT8;
-    stmt            TEXT;
-BEGIN
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__POSTGRESQL__!>, <!
-    -- fix the segment number to 1 for PG
-    segment_num = 1;
-!>, <!
-    -- get the segment number
-    SELECT COUNT(distinct content) FROM gp_segment_configuration
-        WHERE content<>-1 INTO segment_num;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-
-    DROP TABLE IF EXISTS auxiliary_segment_table;
-    CREATE TEMP TABLE auxiliary_segment_table
-    (
-        segment_id  INT
-    ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(segment_id)');
-
-    -- Insert segment_num of records distributed by segment id
-    EXECUTE 'INSERT INTO auxiliary_segment_table
-        SELECT generate_series(1,'||segment_num||');';
-
-    EXECUTE 'SELECT max(id),min(id), count(id) as record_num
-        FROM '||src_table||';' INTO max_id,min_id,record_num;
-    range=max_id-min_id+1;
-
-    -- compute the sample ratio
-    sample_ratio= range/record_num;
-
-    -- compute how many records should be sampled by each segment
-    sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
-
-    -- add the weight field
-
-    IF (range > record_num) THEN
-        -- remove those invalid samples with join operation
-        stmt = MADLIB_SCHEMA.__format
-            (
-            'INSERT INTO %(id, tid, nid, weight)
-              SELECT record_id,
-                     tid AS tid,
-                     tid AS nid,
-                     count(*) AS weight
-              FROM
-                (
-                    SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
-                            MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
-                    FROM auxiliary_segment_table
-                ) t,
-                % k
-              WHERE t.record_id=k.id
-              GROUP BY record_id, tid, nid',
-            ARRAY[
-                target_table,
-                sample_per_seg::TEXT,
-                min_id::TEXT,
-                max_id::TEXT,
-                sample_per_seg::TEXT,
-                num_of_tree::TEXT,
-                src_table
-            ]
-            );
-    ELSE
-        stmt = MADLIB_SCHEMA.__format
-            (
-            'INSERT INTO %(id, tid, nid, weight)
-              SELECT record_id,
-                     tid AS tid,
-                     tid AS nid,
-                     count(*) AS weight
-              FROM
-                (
-                    SELECT  MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
-                            MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
-                    FROM auxiliary_segment_table
-                ) t
-              GROUP BY record_id, tid, nid',
-            ARRAY[
-                target_table,
-                sample_per_seg::TEXT,
-                min_id::TEXT,
-                max_id::TEXT,
-                sample_per_seg::TEXT,
-                num_of_tree::TEXT
-            ]
-            );
-    END IF;
-
-    EXECUTE stmt;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief This function trains a decision tree or random forest.
- *
- * @param split_criterion             This parameter specifies which split criterion
- *                                    should be used for tree construction and
- *                                    pruning. The valid values are infogain,
- *                                    gainratio, and gini.
- * @param num_trees                   Total number of trees to be trained.
- * @param features_per_node           Total number of features used to compute split
- *                                    gain for each node.
- * @param training_table_name         The name of the table/view with the source data.
- * @param training_table_meta         The name of the table with the meta data.
- * @param result_tree_table_name      The name of the table where the resulting
- *                                    DT/RF will be stored.
- * @param validation_table_name       The validation table used for pruning tree.
- * @param id_col_name                 The name of the column containing id of each point.
- * @param class_col_name              The name of the column containing correct class
- *                                    of each point.
- * @param confidence_level            A statistical confidence interval of the
- *                                    resubstitution error.
- * @param max_tree_depth              Maximum decision tree depth.
- * @param node_prune_threshold        Specifies the minimum number of samples required
- *                                    in a child node.
- * @param node_split_threshold        Specifies the minimum number of samples required
- *                                    in a node in order for a further split
- *                                    to be possible.
- * @param sampling_needed             Whether enabling the sampling functionality.
- * @param h2hmv_routine_id            Specifies how to handle missing values.
- *                                    1 ignore, 2 explicit.
- * @param verbosity                   > 0 means this function runs in verbose mode.
- *
- * @return The record including training related information.
- *         Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
-    (
-    split_criterion         TEXT,
-    num_trees               INT,
-    features_per_node       INT,
-    training_table_name     TEXT,
-    training_table_meta     TEXT,
-    result_tree_table_name  TEXT,
-    validation_table_name   TEXT,
-    id_col_name             TEXT,
-    class_col_name          TEXT,
-    confidence_level        FLOAT,
-    max_tree_depth          INT,
-    sampling_percentage     FLOAT,
-    node_prune_threshold    FLOAT,
-    node_split_threshold    FLOAT,
-    sampling_needed         BOOLEAN,
-    h2hmv_routine_id        INT,
-    verbosity               INT
-    )
-RETURNS MADLIB_SCHEMA.__train_result AS $$
-DECLARE
-    num_live_nodes              INT;
-    max_nid                     INT;
-    location                    INT[];
-    temp_location               INT[];
-    num_classes                 INT;
-    answer                      record;
-    location_size               INT;
-    begin_func_exec             TIMESTAMP;
-    begin_find_best             TIMESTAMP;
-    scv_acs_time                INTERVAL;
-    begin_data_transfer         TIMESTAMP;
-    begin_update_best           TIMESTAMP;
-    begin_update_child          TIMESTAMP;
-    begin_update_nid            TIMESTAMP;
-    calc_update_best            INTERVAL;
-    calc_update_child           INTERVAL;
-    calc_update_nid             INTERVAL;
-    ins_upd_time                INTERVAL;
-    begin_olap_acs              TIMESTAMP;
-    calc_acc_time               INTERVAL;
-    calc_pre_time               INTERVAL;
-    calc_olap_time              INTERVAL;
-    begin_bld_assoc             TIMESTAMP;
-    bld_assoc_time              INTERVAL;
-    begin_prune                 TIMESTAMP;
-    prune_time                  INTERVAL;
-    total_size                  FLOAT;
-    sc_code                     INT := 1;
-    curstmt                     TEXT := '';
-    grow_tree                   INT := max_tree_depth;
-    ret                         MADLIB_SCHEMA.__train_result;
-    curr_level                  INT := 1;
-    dp_ids                      INT[];
-    dp_ids_text                 TEXT;
-    instance_time               MADLIB_SCHEMA.__gen_acc_time;
-    tr_table_index              INT := 1;
-    tr_tables                   TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
-    cur_tr_table                TEXT := 'tr_assoc_ping';
-    need_analyze                BOOL := 't'::BOOL;
-    attr_count                  INT;
-    swap_tree_table             TEXT;
-    tree_tbl_rows               TEXT;
-BEGIN
-    -- record the time costed in different steps when training
-    begin_func_exec     = clock_timestamp();
-    scv_acs_time        = begin_func_exec - begin_func_exec;
-    calc_olap_time      = scv_acs_time;
-    calc_acc_time       = scv_acs_time;
-    calc_pre_time       = scv_acs_time;
-    ins_upd_time        = scv_acs_time;
-    calc_update_best    = scv_acs_time;
-    calc_update_child   = scv_acs_time;
-    calc_update_nid     = scv_acs_time;
-    bld_assoc_time      = scv_acs_time;
-    prune_time          = scv_acs_time;
-
-    IF(split_criterion = 'infogain') THEN
-        sc_code = 1;
-    ELSIF (split_criterion = 'gainratio') THEN
-        sc_code = 2;
-    ELSIF (split_criterion = 'gini') THEN
-        sc_code = 3;
-    ELSE
-        RAISE EXCEPTION '%', 'Invalid split criterion!';
-    END IF;
-
-    num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
-
-    IF(verbosity > 0) THEN
-        RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
-    END IF;
-
-    IF(num_classes < 2) THEN
-        RAISE EXCEPTION 'the number of classes must be greater than 2';
-    END IF;
-
-    curstmt = MADLIB_SCHEMA.__format
-        (
-            'SELECT
-                count(*)
-             FROM %
-             WHERE column_type=''f''',
-            training_table_meta
-        );
-    EXECUTE curstmt INTO attr_count;
-
-    -- generate the horizontal table for updating assinged node IDs
-    PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
-        (
-            'tmp_dt_hori_table',
-            training_table_name,
-            attr_count,
-            verbosity
-        );
-
-    EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
-
-    IF(verbosity > 0) THEN
-        RAISE INFO 'INPUT TABLE SIZE: %', total_size;
-    END IF;
-
-    begin_bld_assoc = clock_timestamp();
-    cur_tr_table = tr_tables[tr_table_index];
-
-    -- The table of tr_assoc holds the information of which records are
-    -- used during training for each tree.
-    -- It has four columns.
-    --     id     --   The id of one record.
-    --     tid    --   The id of a tree.
-    --     nid    --   The id of a node in a tree.
-    --     weight --   The times a record is assigned to a node.
-    IF (sampling_needed) THEN
-        PERFORM MADLIB_SCHEMA.__sample_with_replacement
-            (
-            num_trees,
-            round(sampling_percentage * total_size)::BIGINT,
-            'tmp_dt_hori_table',
-            cur_tr_table
-            );
-    ELSE
-        curstmt = MADLIB_SCHEMA.__format
-             (
-                'INSERT INTO %
-                 SELECT id, 1 as tid, 1 as nid, 1 as weight
-                 FROM %',
-                 ARRAY[
-                    cur_tr_table,
-                    'tmp_dt_hori_table'
-                ]
-             );
-        EXECUTE curstmt;
-    END IF;
-
-    -- analyze ping
-    EXECUTE 'ANALYZE ' || cur_tr_table;
-    bld_assoc_time = clock_timestamp() - begin_bld_assoc;
-
-    -- generate the root node for all trees.
-    -- the generated numbers are the same for the two generate_series
-    SELECT MADLIB_SCHEMA.__format
-        (
-            'INSERT INTO %
-                (id, tree_location, feature, probability, max_class,scv,
-                 live, num_of_samples, parent_id, tid)
-            SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
-                   generate_series(1, %)',
-            ARRAY[
-                result_tree_table_name,
-                num_trees::TEXT,
-                num_trees::TEXT
-            ]
-        ) INTO curstmt;
-
-    EXECUTE curstmt;
-
-    max_nid         = num_trees;
-    location_size   = 0;
-
-
-    LOOP
-        EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
-            ' WHERE live > 0 AND array_upper(tree_location,1)='||
-            curr_level||';' INTO num_live_nodes;
-
-        IF (num_live_nodes < 1) THEN
-            IF(verbosity > 0) THEN
-                RAISE INFO 'EXIT: %', 'no live nodes to split';
-            END IF;
-
-            EXIT;
-        END IF;
-
-        IF (verbosity > 0) THEN
-            RAISE INFO 'Running on level:%', curr_level;
-        END IF;
-
-        begin_olap_acs = clock_timestamp();
-
-        instance_time = MADLIB_SCHEMA.__gen_acc
-            (
-            training_table_name,
-            training_table_meta,
-            result_tree_table_name,
-            cur_tr_table,
-            'sf_assoc',
-            features_per_node,
-            num_classes,
-            sampling_needed,
-            verbosity
-            );
-
-        IF (h2hmv_routine_id=1) THEN
-            -- For ignore, we need the true size of nodes to handle the
-            -- missing values.
-            TRUNCATE node_size_aux;
-
-            curstmt = MADLIB_SCHEMA.__format
-                (
-                    'INSERT INTO node_size_aux
-                     SELECT tr.tid, tr.nid, sum(weight) as count
-                     FROM % tr
-                     GROUP BY tr.tid, tr.nid',
-                    cur_tr_table
-                );
-
-            EXECUTE curstmt;
-        END IF;
-
-        calc_pre_time  = calc_pre_time + instance_time.calc_pre_time;
-        calc_acc_time  = calc_acc_time + instance_time.calc_acc_time;
-        calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
-
-        curr_level = curr_level + 1;
-
-        begin_find_best = clock_timestamp();
-
-        PERFORM MADLIB_SCHEMA.__find_best_split
-               (
-               'training_instance',
-               confidence_level,
-               training_table_meta,
-               sc_code,
-               grow_tree,
-               'find_best_answer_table',
-               h2hmv_routine_id,
-               num_classes
-               );
-        IF (verbosity > 0) THEN
-            RAISE INFO 'find best time at this level:%',
-                clock_timestamp() - begin_find_best;
-        END IF;
-        grow_tree = grow_tree - 1;
-
-        scv_acs_time        = scv_acs_time +
-                              (clock_timestamp() - begin_find_best);
-        begin_data_transfer = clock_timestamp();
-        begin_update_best   = clock_timestamp();
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-        SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || swap_tree_table || ';
-            CREATE TABLE ' || swap_tree_table || ' AS
-            SELECT t.* FROM ' || result_tree_table_name || ' t left join find_best_answer_table c
-            ON t.id = c.node_id AND t.tid = c.tid
-            WHERE node_id is NULL';
-
-        EXECUTE '
-            INSERT INTO ' || swap_tree_table || '
-            SELECT
-                k.id, k.tree_location, c.feature, c.probability, c.ebp_coeff, c.max_class,
-                c.max_scv, 0, c.node_size, k.parent_id, k.lmc_nid, k.lmc_fval, c.is_cont,
-                c.split_value, k.tid, k.dp_ids
-            FROM ' || result_tree_table_name || ' k inner join find_best_answer_table c
-            on k.id=c.node_id AND k.tid=c.tid';
-
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || result_tree_table_name;
-        PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
-        -- We get the calculation result for current level.
-        -- Update the nodes of previous level firstly.
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'UPDATE % t
-                 SET feature        = c.feature,
-                     probability    = c.probability,
-                     max_class      = c.max_class,
-                     scv            = c.max_scv,
-                     ebp_coeff      = c.ebp_coeff,
-                     num_of_samples = c.node_size,
-                     live           = 0,
-                     is_cont        = c.is_cont,
-                     split_value    = c.split_value
-                 FROM find_best_answer_table c
-                 WHERE t.id=c.node_id AND t.tid=c.tid',
-                 ARRAY[
-                    result_tree_table_name::TEXT
-                 ]
-             ) INTO curstmt;
-
-        EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-        calc_update_best    = calc_update_best +
-                              (clock_timestamp() - begin_update_best);
-        begin_update_child  = clock_timestamp();
-
-        curstmt=
-            MADLIB_SCHEMA.__format(
-                   'INSERT INTO %(id, tree_location, feature, probability,
-                        max_class, scv, live, parent_id, tid, dp_ids)
-                        SELECT %+row, array_append(tree_location, fval),
-                            0, 1, 1, 1, %, ans.node_id, ans.tid,
-                            CASE when(NOT ans.is_cont) then
-                                array_append( dp_ids, ans.feature)
-                            ELSE
-                                dp_ids
-                            END
-                        FROM % tree,
-                        (
-                            SELECT  *,
-                                    row_number()
-                                    OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
-                            FROM
-                            (
-                                SELECT  *,
-                                        CASE WHEN (is_cont) THEN
-                                            generate_series(1,2)
-                                        ELSE
-                                            generate_series(1, distinct_features)
-                                        END AS fval
-                                FROM
-                                find_best_answer_table
-                                WHERE live>0 AND coalesce(feature, 0) <> 0
-                                      AND node_size >= % AND node_size >= %
-                            ) l
-                        ) ans
-                        WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
-                        ARRAY[
-                            result_tree_table_name,
-                            (max_nid)::TEXT,
-                            curr_level::TEXT,
-                            result_tree_table_name,
-                            (total_size * node_prune_threshold)::TEXT,
-                            (total_size * node_split_threshold)::TEXT
-                        ]
-                        );
-        IF(verbosity > 0) THEN
-            RAISE INFO 'Generate Child Nodes:%', curstmt;
-        END IF;
-
-        EXECUTE curstmt;
-
-        EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
-
-        IF(verbosity > 0) THEN
-            RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
-        END IF;
-
-        -- insert the leftmost child node id and relevant info
-        -- to the assoc_aux table, so that we will make use of this
-        -- info to  update the assigned nid the samples belong to
-        -- the current node whose id is answer.node_id.
-        SELECT MADLIB_SCHEMA.__format
-            (
-                'INSERT INTO assoc_aux
-                 (nid, fid, lmc_id, svalue, is_cont)
-                    SELECT t.id, t.feature, min(l.id),
-                            t.split_value, t.is_cont
-                    FROM
-                        (SELECT id, parent_id
-                        FROM %
-                        WHERE array_upper(tree_location,1)=%) l,
-                        % t
-                    WHERE l.parent_id=t.id
-                    GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
-                ARRAY[
-                    result_tree_table_name,
-                    curr_level::TEXT,
-                    result_tree_table_name
-                ]
-            ) INTO curstmt;
-
-        IF(verbosity > 0) THEN
-            RAISE INFO 'Update lmc_child Info:%', curstmt;
-        END IF;
-
-        EXECUTE curstmt;
-
-        -- delete the unused nodes on the previous level
-        -- delete those nodes with a size less than node_prune_threshold
-        -- node_prune_threshold will not apply to root node,
-        -- the level is 1 (curr_level - 1 = 1);
-
-        IF (curr_level > 2) THEN
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
-        SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || swap_tree_table || ';
-            CREATE TABLE ' || swap_tree_table || ' AS
-            SELECT t.* FROM ' || result_tree_table_name || ' t
-            WHERE (t.num_of_samples is NULL OR t.num_of_samples >= ' || total_size * node_prune_threshold || ')
-                AND t.live != ' || curr_level - 1;
-        EXECUTE '
-            DROP TABLE IF EXISTS ' || result_tree_table_name;
-        PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
-            curstmt = MADLIB_SCHEMA.__format
-                        (
-                            'DELETE FROM % t
-                             WHERE t.num_of_samples < % OR live = %;',
-                            ARRAY[
-                                result_tree_table_name::TEXT,
-                                (total_size * node_prune_threshold)::TEXT,
-                                (curr_level - 1)::TEXT
-                            ]
-                        );
-            EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-        END IF;
-
-        calc_update_child   = calc_update_child + (clock_timestamp() - begin_update_child);
-        begin_update_nid    = clock_timestamp();
-
-        -- update the assigned node id for each sample on the current level
-        tr_table_index = (tr_table_index % 2) + 1;
-        curstmt = MADLIB_SCHEMA.__format
-            (
-                'INSERT INTO % (id, nid, tid, weight)
-                 SELECT
-                    tr.id,
-                    au.lmc_id - 1 +
-                    CASE WHEN (au.is_cont) THEN
-                            CASE WHEN (svalue < vt.fvals[au.fid]) THEN
-                                2
-                            ELSE
-                                1
-                            END
-                    ELSE
-                        vt.fvals[au.fid]::INT
-                    END AS nid,
-                    tid, weight
-                  FROM % tr, % vt, assoc_aux au
-                  WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
-                ARRAY[
-                    tr_tables[tr_table_index],
-                    cur_tr_table,
-                    'tmp_dt_hori_table'
-                ]
-            );
-        IF (verbosity > 0) THEN
-            RAISE INFO '%', curstmt;
-        END IF;
-
-        EXECUTE curstmt;
-        EXECUTE 'TRUNCATE ' || cur_tr_table;
-        cur_tr_table = tr_tables[tr_table_index];
-
-        IF (need_analyze) THEN
-            -- analyze pong table
-            EXECUTE 'ANALYZE ' || cur_tr_table;
-            need_analyze = 'f'::BOOL;
-        END IF;
-
-        EXECUTE 'TRUNCATE assoc_aux';
-
-        calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
-
-        ins_upd_time = ins_upd_time +
-            (clock_timestamp() - begin_data_transfer);
-        IF(verbosity > 0) THEN
-            RAISE INFO 'computation time in this level:%',
-                (clock_timestamp() - begin_find_best);
-        END IF;
-
-    END LOOP;
-
-    PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
-
-    begin_prune = clock_timestamp();
-    IF (confidence_level < 100.0) THEN
-       PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
-    END IF;
-
-    IF (validation_table_name IS NOT NULL) THEN
-       PERFORM MADLIB_SCHEMA.__rep_prune_tree
-                  (
-                  result_tree_table_name,
-                  validation_table_name ,
-                  num_classes
-                  );
-    END IF;
-    prune_time = clock_timestamp() - begin_prune;
-
-    IF(verbosity > 0) THEN
-        RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
-        RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
-        RAISE INFO 'time of calculating ACC: %', calc_acc_time;
-        RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
-        RAISE INFO 'time of pruning: %', prune_time;
-        RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
-    END IF;
-
-    SELECT MADLIB_SCHEMA.__format
-        (
-            'SELECT COUNT(id), max(array_upper(tree_location, 1))
-             FROM %',
-             ARRAY[
-                result_tree_table_name
-             ]
-        ) INTO curstmt;
-
-    EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
-
-    ret.features_per_node   = features_per_node;
-    ret.num_of_samples        = total_size;
-    ret.calc_acc_time       = calc_acc_time;
-    ret.calc_pre_time       = calc_pre_time;
-    ret.update_time         = ins_upd_time;
-    ret.update_best         = calc_update_best;
-    ret.update_child        = calc_update_child;
-    ret.update_nid          = calc_update_nid;
-    ret.scv_acs_time        = scv_acs_time;
-    ret.prune_time          = prune_time;
-
-    RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief This is an internal function for displaying one tree node in human
- *        readable format. It is the step function of aggregation named
- *        __display_tree_aggr.
- *
- * @param state     This variable is used to store the accumulated tree
- *                  display information.
- * @param depth     The depth of this node.
- * @param is_cont   Whether the feature used to split is continuous.
- * @param feat_name The name of the feature used to split.
- * @param curr_val  The value of the splitting feature for this node.
- * @param split_value    For continuous feature, it specifies the split value.
- *                  Otherwise, it is of no meaning.
- * @param max_prob  For those elements in this node, the probability that
- *                  an element belongs to the max_class.
- * @param max_class The class ID with the largest number of elements
- *                  for those elements in this node.
- * @param num_of_samples Total count of samples in this node.
- *
- * @return It returns the text containing the information of human
- *         readable information for trees.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
-    (
-    state           TEXT,
-    depth           INT,
-    is_cont         BOOLEAN,
-    feat_name       TEXT,
-    curr_val        TEXT,
-    split_value     FLOAT8,
-    max_prob        FLOAT8,
-    max_class       TEXT,
-    num_of_samples  INT
-    )
-RETURNS TEXT AS $$
-DECLARE
-    ret                     TEXT := '';
-    index                   INT;
-BEGIN
-    -- We add indentation based on the depth.
-    FOR index IN 0..depth LOOP
-        ret = ret || '    ';
-    END LOOP;
-
-    IF (depth > 0) THEN
-        ret = ret ||coalesce(feat_name,'null')||': ';
-        -- For continuous features, there are two splits.
-        -- We will mark curr_val to 1 for '<='. Otherwise,
-        -- we will mark curr_val to 2.
-        IF (is_cont) THEN
-            IF (curr_val::INT = 1) THEN
-                ret = ret || ' <= ';
-            ELSE
-                ret = ret || ' > ';
-            END IF;
-            ret = ret||coalesce(split_value,0)||' ';
-        ELSE
-            ret = ret||' = '||coalesce(curr_val,'null')||' ';
-        END IF;
-    ELSE
-        ret = ret||'Root Node ';
-    END IF;
-
-    ret = ret                               ||
-          ' : class('                       ||
-          coalesce(max_class,null)          ||
-          ')   num_elements('               ||
-          coalesce(

<TRUNCATED>