You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/04/01 03:21:28 UTC
[09/11] incubator-madlib git commit: Build: Add support for HAWQ 2.0
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/96f9ac04/methods/cart/src/pg_gp/dt.sql_in
----------------------------------------------------------------------
diff --git a/methods/cart/src/pg_gp/dt.sql_in b/methods/cart/src/pg_gp/dt.sql_in
deleted file mode 100644
index b77fa67..0000000
--- a/methods/cart/src/pg_gp/dt.sql_in
+++ /dev/null
@@ -1,4299 +0,0 @@
-/* ----------------------------------------------------------------------- *//**
- *
- * @file dt.sql_in
- *
- * @brief the common functions written in PL/PGSQL shared by C4.5 and RF
- * @date April 5, 2012
- *
- *//* ----------------------------------------------------------------------- */
-
-m4_include(`SQLCommon.m4')
-
-m4_ifelse(
- m4_eval(
- m4_ifdef(`__GREENPLUM__', 1, 0) &&
- __DBMS_VERSION_MAJOR__ * 10000 +
- __DBMS_VERSION_MINOR__ * 100 +
- __DBMS_VERSION_PATCH__ >= 40201
- ), 1,
- `m4_define(`__GREENPLUM_GE_4_2_1__')'
-)
-
-/*
- * @brief Test if the given table is a valid encoded one.
- * A valid encoded table has the following characteristic:
- * + It has 5 columns, whose names are id, fid, fval,
- * is_cont and class.
- * + The types of the 5 columns are BIGINT, INT, FLOAT8
- * BOOL and INT.
- *
- * @param enc_tbl_name The full name of the encoded table.
- *
- * @return Ture if the given table is a valid encoded one.
- * False if it's an invalid encoded table.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__is_valid_enc_table
- (
- enc_tbl_name TEXT
- )
-RETURNS BOOL AS $$
-DECLARE
- num_enc_table INT;
- num_cols INT;
- ret BOOL := 'f'::BOOL;
-BEGIN
- -- test if the name and the type of a column are valid or not
- SELECT count(*)
- FROM pg_attribute
- WHERE attrelid= enc_tbl_name::regclass::oid AND
- attnum > 0 AND
- not attisdropped AND
- attname in ('id', 'fid', 'fval', 'is_cont', 'class') AND
- atttypid in ('int8'::regtype, 'int'::regtype, 'float8'::regtype,
- 'bool'::regtype, 'int'::regtype)
- INTO num_cols;
-
- IF (num_cols = 5) THEN
- ret = 't'::BOOL;
- END IF;
-
- RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `READS SQL DATA', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_sfunc
- (
- result FLOAT8[], -- intermediate result
- scv FLOAT8[],
- fid INT,
- split_value FLOAT8
- )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_best_scv_sfunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__best_scv_prefunc
- (
- sfunc1_result FLOAT8[],
- sfunc2_result FLOAT8[]
- )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_best_scv_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__best_scv_aggr
- (
- FLOAT8[], -- scv
- INT, -- fid
- FLOAT8 -- split_value
- ) CASCADE;
-CREATE
-AGGREGATE MADLIB_SCHEMA.__best_scv_aggr
- (
- FLOAT8[], -- scv
- INT, -- fid
- FLOAT8 -- split_value
- )
-(
- SFUNC=MADLIB_SCHEMA.__best_scv_sfunc,
- m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__best_scv_prefunc,')
- STYPE=FLOAT8[],
- initcond = '{0, 0, 0, 0, 0, 0, 0}'
-);
-
-
-/*
- * @brief The step function is defined to process each record in the ACS set.
- * The records have this format:
- * {fid, fval, is_cont, split_value, le, total, tid, nid}
- *
- * @param result The array used to keep the best attribute's info.
- * @param sc_code The code of the split criterion.
- * @param is_cont True - The feature is continuous.
- * False - The feature is discrete.
- * @param num_class The total number of classes.
- * @param le_array The le component of the ACS record. le_array[i] is the
- * number of samples whose class code equals to i and
- * whose fval is less-than or equal to the fval component
- * of the ACS record being processed.
- * @param total_array The total component of the ACS record. total_array[i] is
- * the number of samples whose class code equals to i.
- * @param true_total The real total number of samples currently assigned to
- * the node identified by (tid, nid). If there are missing
- * values in fval, the sum of all elements in total_array
- * will be less than true_total.
- *
- * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
- * in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_sfunc
- (
- result FLOAT8[],
- sc_code INT,
- is_cont BOOLEAN,
- num_class INT,
- le_array FLOAT8[],
- total_array FLOAT8[],
- true_total BIGINT
- )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The pre-function for the aggregation of splitting criteria values. It
- * takes the state array produced by two sfunc and combine them together.
- *
- * @param sfunc1_result The array from sfunc1.
- * @param sfunc2_result The array from sfunc2.
- *
- * @return A 9-element array. Please refer to the definition of SCV_STATE_ARRAY_INDEX
- * in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_prefunc
- (
- sfunc1_result FLOAT8[],
- sfunc2_result FLOAT8[]
- )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function for the aggregation of splitting criteria values.
- * It takes the state array produced by the sfunc and produces a
- * 5-element array.
- *
- * @param internal_result The 9-element array produced by dt_scv_aggr_prefunc
- *
- * @return A 5-element array. Please refer to the definition of SCV_FINAL_ARRAY_INDEX
- * in dt.c for the detailed information of this array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__scv_aggr_ffunc
- (
- internal_result FLOAT8[]
- )
-RETURNS FLOAT8[]
-AS 'MODULE_PATHNAME', 'dt_scv_aggr_ffunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__scv_aggr
- (
- INT, -- sc
- BOOLEAN, -- is_cont
- INT, -- total number of classes
- FLOAT8[], -- le array
- FLOAT8[], -- total count array
- BIGINT -- the total number of samples
- ) CASCADE;
-CREATE
-AGGREGATE MADLIB_SCHEMA.__scv_aggr
- (
- INT, -- sc
- BOOLEAN, -- is_cont
- INT, -- total number of classes
- FLOAT8[], -- le array
- FLOAT8[], -- total count array
- BIGINT -- the total number of samples
- )
-(
- SFUNC=MADLIB_SCHEMA.__scv_aggr_sfunc,
- m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__scv_aggr_prefunc,')
- FINALFUNC=MADLIB_SCHEMA.__scv_aggr_ffunc,
- STYPE=FLOAT8[],
- initcond = '{0, 0, 0, 0, 0, 0, 0, 0, 0}'
- -- 1 sc: 1 infogain, 2 gainratio, 3 gini
- -- 2 is_cont
- -- 3 scv_class_info
- -- 4 scv_attr_info
- -- 5 scv_class_attr_info
- -- 6 scv_count
- -- 7 scv_total
- -- 8 max_class_id
- -- 9 max_class_count
-);
-
-
-/*
- * @brief Retrieve the specified number of unique features for a node.
- * Discrete features used by ancestor nodes will be excluded.
- * If the number of remaining features is less or equal than the
- * requested number of features, then all the remaining features
- * will be returned. Otherwise, we will sample the requested
- * number of features from the remaining features.
- *
- * @param num_req_features The number of requested features.
- * @param num_features The total number of features.
- * @param nid The ID of the node for which the
- * features are sampled.
- * @param dp_fids The IDs of the discrete features
- * used by the ancestors.
- *
- * @return An array containing all the IDs of chosen features.
- *
- */
-CREATE OR REPLACE FUNCTION
-MADLIB_SCHEMA.__dt_get_node_split_fids(INT4, INT4, INT4, INT4[])
-RETURNS INT[]
-AS 'MODULE_PATHNAME', 'dt_get_node_split_fids'
-LANGUAGE C VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Retrieve the selected features for a node. We will create a table, named
- * sf_association, to store the association between selected feature IDs and
- * node IDs.
- *
- * @param nid_table_name The full name of the table which contains all the
- * node IDs.
- * @param result_table_name The full name of the table which contains the parent
- * discrete features for each node.
- * @param num_chosen_fids The number of feature IDs will be chosen for a node.
- * @param total_num_fids The total number of feature IDs, total_num_fids
- * >= num_chosen_fids.
- * If num_chosen_fids < total_num_fids, then we will
- * randomly select num_chosen_fids features from all
- * the features. Otherwise, we will return all the
- * features exception they belong to the parent discrete
- * features for a node.
- * @param verbosity > 0 means this function runs in verbose mode.
- *
- * @return An constant string for the association table name.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__get_features_of_nodes
- (
- nid_table_name TEXT,
- result_table_name TEXT,
- num_chosen_fids INT,
- total_num_fids INT,
- verbosity INT
- )
-RETURNS TEXT AS $$
-DECLARE
- curstmt TEXT;
-BEGIN
- -- The sf_association table records which features are used
- -- for finding the best split for a node.
- -- It has two columns:
- -- nid -- The id of a node.
- -- fid -- The id of a feature.
- EXECUTE 'TRUNCATE sf_assoc';
-
- curstmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO sf_assoc(nid, fid)
- SELECT
- nid,
- unnest(MADLIB_SCHEMA.__dt_get_node_split_fids(%, %,
- nid,dp_ids)) as fid
- FROM (SELECT nid, dp_ids
- FROM % s1, % s2
- WHERE s1.nid = s2.id
- GROUP BY nid, dp_ids) t',
- ARRAY[
- num_chosen_fids::TEXT,
- total_num_fids::TEXT,
- nid_table_name,
- result_table_name
- ]
- );
-
- IF (verbosity > 0) THEN
- RAISE INFO 'build sample feature association stmt: %', curstmt;
- END IF;
-
- EXECUTE curstmt;
-
- -- we return an constant string for the association table name
- return 'sf_assoc';
-
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * This UDT is used to keep the times of generating acc.
- *
- * calc_pre_time The time of pre-processing.
- * calc_acc_time The time of calculating acc.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__gen_acc_time CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__gen_acc_time AS
-(
- calc_pre_time INTERVAL,
- calc_acc_time INTERVAL
-);
-
-
-/*
- * @brief Generate the ACC for current leaf nodes.
- *
- * @param encoded_table_name The full name of the encoded table for the
- * training table.
- * @param metatable_name The full name of the metatable contains the
- * relevant information of the input table.
- * @param result_table_name The full name of the training result table.
- * @param num_featrue_try The number of features will be chosen per node.
- * @param num_classes Total number of classes in training set.
- * @param verbosity > 0 means this function runs in verbose mode.
- *
- * @return The time information for generating ACC.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__gen_acc
- (
- encoded_table_name TEXT,
- metatable_name TEXT,
- result_table_name TEXT,
- tr_table_name TEXT,
- sf_table_name TEXT,
- num_featrue_try INT,
- num_classes INT,
- sampling_needed BOOLEAN,
- verbosity INT
- )
-RETURNS MADLIB_SCHEMA.__gen_acc_time AS $$
-DECLARE
- curstmt TEXT := '';
- num_fids INT := 1;
- begin_calc_acc TIMESTAMP;
- begin_calc_pre TIMESTAMP;
- ret MADLIB_SCHEMA.__gen_acc_time;
- select_stmt TEXT;
-BEGIN
- begin_calc_pre = clock_timestamp();
-
- -- get the number of features
- curstmt = MADLIB_SCHEMA.__format
- (
- 'SELECT COUNT(id)
- FROM %
- WHERE column_type = ''f''',
- metatable_name
- );
- EXECUTE curstmt INTO num_fids;
-
- -- preprocessing time
- ret.calc_pre_time = clock_timestamp() - begin_calc_pre;
- begin_calc_acc = clock_timestamp();
-
- IF (sampling_needed) THEN
- PERFORM MADLIB_SCHEMA.__get_features_of_nodes
- (
- tr_table_name,
- result_table_name,
- num_featrue_try,
- num_fids,
- verbosity
- );
-
- select_stmt = MADLIB_SCHEMA.__format
- (
- 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
- ed.class, sum(weight) as count
- FROM % ed, % tr, % sf
- WHERE tr.nid = sf.nid AND ed.fid = sf.fid AND ed.id = tr.id
- GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
- ed.is_cont, ed.class',
- ARRAY[
- encoded_table_name,
- tr_table_name,
- sf_table_name
- ]
- );
- ELSE
- select_stmt = MADLIB_SCHEMA.__format
- (
- 'SELECT tr.tid, tr.nid, ed.fid, ed.fval, ed.is_cont,
- ed.class, sum(weight) as count
- FROM % ed, % tr
- WHERE ed.id = tr.id
- GROUP BY tr.tid, tr.nid, ed.fid, ed.fval,
- ed.is_cont, ed.class',
- ARRAY[
- encoded_table_name,
- tr_table_name
- ]
- );
- END IF;
- DROP TABLE IF EXISTS training_instance_aux;
- m4_changequote(`<!', `!>')
- curstmt = MADLIB_SCHEMA.__format
- (
- 'CREATE TEMP TABLE training_instance_aux AS
- SELECT tid, nid, fid, fval, is_cont,
- MADLIB_SCHEMA.__dt_acc_count_aggr
- (%,count::BIGINT,class::INT) AS count
- FROM
- (
- %
- ) l
- GROUP BY tid,nid,fid, fval,is_cont
- m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid, fval)!>)',
- ARRAY[
- num_classes::TEXT,
- select_stmt
- ]
- );
- m4_changequote(<!`!>, <!'!>)
-
- IF ( verbosity>0 ) THEN
- RAISE INFO '%', curstmt;
- END IF;
-
- EXECUTE curstmt;
- ret.calc_acc_time = clock_timestamp() - begin_calc_acc;
-
- RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__rep_type CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__rep_type AS
- (
- numOfOrgClasses BIGINT[]
- );
-
-
-/*
- * @brief The step function for aggregating the class counts while doing Reduce
- * Error Pruning (REP).
- *
- * @param class_count_array The array used to store the accumulated information.
- * [0]: the total number of mis-classified samples.
- * [i]: the number of samples belonging to the ith class.
- * @param classified_class The predicted class based on our trained DT model.
- * @param original_class The real class value provided in the validation set.
- * @param max_num_of_classes The total number of distinct class values.
- *
- * @return An updated class count array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_sfunc
- (
- class_count_array BIGINT[],
- classified_class INT,
- original_class INT,
- max_num_of_classes INT
- )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Add the corresponding elements of the input arrays
- * to create a new one.
- *
- * @param 1 arg The array 1.
- * @param 2 arg The array 2.
- *
- * @return The new array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__bigint_array_add
- (
- BIGINT[],
- BIGINT[]
- )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'bigint_array_add'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function for aggregating the class counts for REP.
- * It takes the class count array produced by the sfunc and produces a
- * two-element array. The first element is the ID of the class that has
- * the maximum number of samples represented by the root node of the subtree
- * being processed. The second element is the number of reduced
- * misclassified samples if the leave nodes of the subtree are pruned.
- *
- * @param class_count_data The array containing all the information for the
- * calculation of Reduced-Error pruning.
- *
- * @return A two element array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_aggr_class_count_ffunc
- (
- class_count_array BIGINT[]
- )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_rep_aggr_class_count_ffunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__rep_aggr_class_count
- (
- INT,
- INT,
- INT
- );
-CREATE AGGREGATE MADLIB_SCHEMA.__rep_aggr_class_count
- (
- INT,
- INT,
- INT
- )
-(
- SFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_sfunc,
- m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
- FINALFUNC=MADLIB_SCHEMA.__rep_aggr_class_count_ffunc,
- STYPE=BIGINT[]
-);
-
-
-/*
- * @brief The step function of the aggregate __array_indexed_agg.
- *
- * @param state The step state array of the aggregate function.
- * @param elem The element to be filled into the state array.
- * @param elem_cnt The number of elements.
- * @param elem_idx the subscript of "elem" in the state array.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_sfunc
- (
- state float8[],
- elem float8,
- elem_cnt int8,
- elem_idx int8
- )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_sfunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The Pre-function of the aggregate __array_indexed_agg.
- *
- * @param arg0 The first state array.
- * @param arg1 The second state array.
- *
- * @return The combined state.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_prefunc
- (
- float8[],
- float8[]
- )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_prefunc'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The final function of __array_indexed_agg.
- *
- * @param state The state array.
- *
- * @return The aggregate result.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__array_indexed_agg_ffunc
- (
- float8[]
- )
-RETURNS float8[]
-AS 'MODULE_PATHNAME', 'dt_array_indexed_agg_ffunc'
-LANGUAGE C IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The aggregate is the same with array_agg, which will accumulate
- * The elements in each group to an array, except that we allow users
- * provide the subscript for each element. This aggregate will be
- * invoked as HashAggregate, while array_agg will be called as
- * GroupAggregate. Therefore, our implementation have better performance
- * than the array_agg.
- *
- * @param elem The element to be fed into the returned array of this aggregate.
- * @param elem_cnt The number of elements.
- * @param elem_idx The subscript of the element.
- *
- * @return The aggregated array.
- *
- */
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8);
-CREATE AGGREGATE MADLIB_SCHEMA.__array_indexed_agg(float8, int8, int8) (
- SFUNC = MADLIB_SCHEMA.__array_indexed_agg_sfunc,
- m4_ifdef( `__POSTGRESQL__', `', `PREFUNC = MADLIB_SCHEMA.__array_indexed_agg_prefunc,')
- FINALFUNC = MADLIB_SCHEMA.__array_indexed_agg_ffunc,
- STYPE = float8[]
-);
-
-
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__dt_acc_count_sfunc
- (
- count_array BIGINT[],
- num_of_class INT,
- count BIGINT,
- class INT
- )
-RETURNS BIGINT[]
-AS 'MODULE_PATHNAME', 'dt_acc_count_sfunc'
-LANGUAGE C VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__dt_acc_count_aggr (INT, BIGINT, INT);
-CREATE AGGREGATE MADLIB_SCHEMA.__dt_acc_count_aggr
- (
- INT,
- BIGINT,
- INT
- )
-(
- SFUNC=MADLIB_SCHEMA.__dt_acc_count_sfunc,
- m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
- STYPE=BIGINT[]
-);
-
-
-/*
- * @brief The aggregate is created for the PostgreSQL, which doesn't support the
- * function sum over an array.
- *
- * @param elem The element to be fed into the returned array of this aggregate.
- *
- * @return The array with the sum of all the input array in a group.
- *
- */
-DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.__bigint_array_sum (BIGINT[]);
-CREATE AGGREGATE MADLIB_SCHEMA.__bigint_array_sum
- (
- BIGINT[]
- )
-(
- SFUNC=MADLIB_SCHEMA.__bigint_array_add,
- m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.__bigint_array_add,')
- STYPE=BIGINT[]
-);
-
-
-/*
- * @brief This function find the best split and return the information.
- *
- * @param table_name The name of the table containing the training
- * set.
- * @param confidence_level This parameter is used by the 'Error-Based Pruning'.
- * Please refer to the paper for detailed definition.
- * The paper's name is 'Error-Based Pruning of Decision
- * Trees Grown on Very Large Data Sets Can Work!'.
- * @param feature_table_name Is is the name of one internal table, which contains
- * meta data for each feature.
- * @param split_criterion It defines the split criterion to be used.
- * (1- information gain. 2- gain ratio. 3- gini).
- * @param continue_grow It specifies whether we should still grow the tree
- * on the selected branch.
- * @param output_table It specifies the table used to store the chosen splits.
- * @param h2hmv_routine_id Specifies how to handle missing values.
- * 1 ignore, 2 explicit.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__find_best_split
- (
- table_name TEXT,
- confidence_level FLOAT,
- feature_table_name TEXT,
- split_criterion INT,
- continue_grow INT,
- output_table TEXT,
- h2hmv_routine_id INT,
- num_classes INT
- )
-RETURNS VOID AS $$
-DECLARE
- total_size INT;
- curstmt TEXT := '';
- begin_func_exec TIMESTAMP;
- select_stmt TEXT;
-BEGIN
- begin_func_exec = clock_timestamp();
-
- m4_changequote(`<!', `!>')
- IF (h2hmv_routine_id=1) THEN
- -- For ignore, we need the true size of nodes to handle the missing values.
- select_stmt =
- 'SELECT t1.tid, t1.nid, t1.fid, t1.total, t2.node_size::BIGINT
- FROM
- (
- SELECT tid, nid, fid,
- m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total
- FROM training_instance_aux
- GROUP BY tid, nid, fid
- ) t1 INNER JOIN node_size_aux t2
- ON t1.tid=t2.tid AND t1.nid=t2.nid';
- ELSE
- -- For explicit, the calculated node size from the aggregation is correct.
- -- We can set NULL, which denotes we can safely use the counted value.
- select_stmt =
- 'SELECT tid, nid, fid,
- m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) as total,
- NULL::BIGINT AS node_size
- FROM training_instance_aux
- GROUP BY tid, nid, fid';
- END IF;
-
- /*
- * This table is used to store information for the calculated best split
- *
- * tid The ID of the tree.
- * node_id The ID of one node in the specified tree.
- * feature The ID of the selected feature.
- * probability The predicted probability of our chosen class.
- * max_class The ID of the class chosen by the algorithm.
- * max_scv The maximum split criterion value.
- * live 1- For the chosen split, we should split further.
- * 0- For the chosen split, we shouldn't split further.
- * ebp_coeff total error for error-based pruning.
- * is_cont whether the selected feature is continuous.
- * split_value If the selected feature is continuous, it specifies
- * the split value. Otherwise, it is of no use.
- * distinct_features The number of distinct values for the selected feature.
- * node_size The size of this tree node.
- *
- */
- EXECUTE 'DROP TABLE IF EXISTS '||output_table;
- EXECUTE 'CREATE TEMP TABLE '||output_table||'
- (
- tid INT,
- node_id INT,
- feature INT,
- probability FLOAT,
- max_class INTEGER,
- max_scv FLOAT,
- live INT,
- ebp_coeff FLOAT,
- is_cont BOOLEAN,
- split_value FLOAT,
- distinct_features INT,
- node_size INT
- ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (node_id)!>);';
-
-
- EXECUTE 'DROP TABLE IF EXISTS tmp_best_table';
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %
- SELECT tid, nid, best_scv[6], best_scv[4], best_scv[3], best_scv[1],
- CASE WHEN (best_scv[1] < 1e-9 OR
- best_scv[4] > 1-1e-9 OR % <= 0 ) THEN
- 0
- ELSE
- 1
- END AS live,
- MADLIB_SCHEMA.__ebp_calc_errors
- (best_scv[5], best_scv[4], %) AS ebp_coeff,
- o2.is_cont,
- CASE WHEN( o2.is_cont ) THEN
- best_scv[7]
- ELSE
- NULL
- END AS split_value,
- o2.num_dist_value, best_scv[5]
- FROM
- (
- SELECT s1.tid, s1.nid,
- MADLIB_SCHEMA.__best_scv_aggr(scv, s1.fid,
- coalesce(s1.split_value,0)) as best_scv
- FROM (
- SELECT t1.tid, t1.nid, t1.fid, split_value,
- MADLIB_SCHEMA.__scv_aggr
- (%, is_cont, %, le, total, t2.node_size) AS scv
- FROM
- (
- SELECT tid, nid, fid, fval, is_cont,
- CASE WHEN (is_cont) THEN
- fval
- ELSE
- NULL::FLOAT8
- END AS split_value,
- CASE WHEN (is_cont) THEN
- m4_ifdef(<!__POSTGRESQL__!>, <!MADLIB_SCHEMA.__bigint_array_sum(count)!>, <!sum(count)!>) OVER
- (PARTITION BY tid, nid, fid ORDER BY fval
- ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
- ELSE
- count
- END AS le
- FROM training_instance_aux
- ) t1,
- (
- %
- ) t2
- WHERE t1.tid = t2.tid AND t1.nid = t2.nid AND t1.fid = t2.fid
- GROUP BY t1.tid, t1.nid, t1.fid, split_value
- ) s1
- GROUP BY s1.tid, s1.nid
- ) o1 INNER JOIN % o2 ON o1.best_scv[6]::INT=o2.id',
- ARRAY[
- output_table,
- continue_grow::TEXT,
- confidence_level::TEXT,
- split_criterion::TEXT,
- num_classes::TEXT,
- select_stmt,
- feature_table_name
- ]
- ) INTO curstmt;
-
- EXECUTE curstmt;
- m4_changequote(<!`!>, <!'!>)
-
- RETURN;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief For training one decision tree, we need some internal tables
- * to store intermediate results. This function creates those
- * tables. Moreover, this function also creates the tree table
- * specified by user.
- *
- * @param result_tree_table_name The name of the tree specified by user.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__create_tree_tables
- (
- result_tree_table_name TEXT
- )
-RETURNS void AS $$
-BEGIN
- -- The table of node_size_aux records the size of each node. It is used
- -- for missing value handling.
- DROP TABLE IF EXISTS node_size_aux CASCADE;
- CREATE TEMP TABLE node_size_aux
- (
- tid INT,
- nid INT,
- node_size INT
- )m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid,nid)');
-
- -- The table below stores the decision tree information just constructed.
- -- Columns:
- -- id: The ID of the node represented by this row. Tree
- -- node IDs are unique across all trees. The IDs of
- -- all children of a node is made to be continuous.
- -- tree_location: An array containing the encoded values of all the
- -- features on the path from the root node to the
- -- current node. For the root node, the location
- -- value is {0}.
- -- feature: The ID of the best split feature chosen for the
- -- node represented by this row.
- -- probability: If forced to make a call for a dominant class
- -- at a given point this would be the confidence of the
- -- call (this is only an estimated value).
- -- ebp_coeff: The total errors used by error based pruning (ebp)
- -- based on the specified confidence level. RF does
- -- not do EBP therefore for RF nodes, this column always
- -- contains 1.
- -- max_class: If forced to make a call for a dominant class
- -- at a given point this is the selected class.
- -- scv: The splitting criteria value (scv) computed at this node.
- -- live: Specifies whether the node should be further split
- -- or not. A positive value indicates further split of
- -- the node represented by this row is needed.
- -- num_of_samples: The number of samples at this node.
- -- parent_id: Id of the parent branch.
- -- lmc_nid: Leftmost child (lmc) node id of the node represented
- -- by the current row.
- -- lmc_fval: The feature value which leads to the lmc node.
- -- An example of getting all the child nodes' ids
- -- and condition values
- -- 1. Get the right most node id
- -- SELECT DISTINCT ON(parent_id) id FROM tree_table
- -- WHERE parent_id = $pid ORDER BY parent_id, id desc
- -- INTO max_child_nid;
- -- 2. Get child nodes' ids and condition values by a
- -- while loop
- -- node_count = 1;
- -- WHILE (lmc_nid IS NOT NULL) AND
- -- (0 < node_count AND lmc_nid <= max_child_nid) LOOP
- -- ...
- -- lmc_nid = lmc_nid + 1;
- -- lmc_fval = lmc_fval + 1;
- -- SELECT COUNT(id) FROM tree_table
- -- WHERE id = $lmc_nid AND parent_id = $pid
- -- INTO node_count;
- -- END LOOP;
- -- is_cont: It specifies whether the selected feature is a
- -- continuous feature.
- -- split_value: For continuous feature, it specifies the split value.
- -- Otherwise, it is of no meaning and fixed to 0.
- -- tid: The id of a tree that this node belongs to.
- -- dp_ids: An array containing the IDs of the non-continuous
- -- features chosen by all ancestors nodes (starting
- -- from the root) for splitting.
- --
- -- The table below stores the final decision tree information.
- -- It is an the table specified by users.
- -- Please refer the table above for detailed column definition.
- m4_changequote(`<!', `!>')
- EXECUTE 'DROP TABLE IF EXISTS '||result_tree_table_name||' CASCADE;';
- EXECUTE 'CREATE TABLE '||result_tree_table_name||'
- (
- id INT,
- tree_location INT[],
- feature INT,
- probability FLOAT,
- ebp_coeff FLOAT,
- max_class INTEGER,
- scv FLOAT,
- live INT,
- num_of_samples INT,
- parent_id INT,
- lmc_nid INT,
- lmc_fval INT,
- is_cont BOOLEAN,
- split_value FLOAT,
- tid INT,
- dp_ids INT[]
- ) m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (tid,id)!>);';
- m4_changequote(<!`!>, <!'!>)
-
- -- The following table stored the auxiliary information for updating the
- -- association table, so that the updating operation only need to
- -- join the encoded table with association table once
- EXECUTE 'DROP TABLE IF EXISTS assoc_aux CASCADE';
- CREATE TEMP TABLE assoc_aux
- (
- nid INT,
- fid INT,
- lmc_id INT,
- svalue FLOAT,
- is_cont BOOL
- ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (nid)');
-
- EXECUTE 'DROP TABLE IF EXISTS tr_assoc_ping CASCADE';
- EXECUTE 'DROP TABLE IF EXISTS tr_assoc_pong CASCADE';
- EXECUTE 'DROP TABLE IF EXISTS sf_assoc CASCADE';
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__GREENPLUM_GE_4_2_1__!>, <!
- CREATE TEMP TABLE tr_assoc_ping
- (
- id BIGINT ENCODING (compresstype=RLE_TYPE),
- nid INT ENCODING (compresstype=RLE_TYPE),
- tid INT ENCODING (compresstype=RLE_TYPE),
- weight INT ENCODING (compresstype=RLE_TYPE)
- )
- WITH(appendonly=true, orientation=column)
- DISTRIBUTED BY(id);
-
- CREATE TEMP TABLE tr_assoc_pong
- (
- id BIGINT ENCODING (compresstype=RLE_TYPE),
- nid INT ENCODING (compresstype=RLE_TYPE),
- tid INT ENCODING (compresstype=RLE_TYPE),
- weight INT ENCODING (compresstype=RLE_TYPE)
- )
- WITH(appendonly=true, orientation=column)
- DISTRIBUTED BY(id);
-
- CREATE TEMP TABLE sf_assoc
- (
- nid INT ENCODING (compresstype=RLE_TYPE),
- fid INT ENCODING (compresstype=RLE_TYPE)
- )
- WITH(appendonly=true, orientation=column)
- DISTRIBUTED BY(fid);
-!>, <!
- CREATE TEMP TABLE tr_assoc_ping
- (
- id BIGINT,
- nid INT,
- tid INT,
- weight INT
- )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
- CREATE TEMP TABLE tr_assoc_pong
- (
- id BIGINT,
- nid INT,
- tid INT,
- weight INT
- )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>);
- CREATE TEMP TABLE sf_assoc
- (
- nid INT,
- fid INT
- )m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (fid)!>);
-!>)
-m4_changequote(<!`!>, <!'!>)
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Prune the trained tree with "Reduced Error Pruning" algorithm.
- *
- * @param tree_table_name The name of the table containing the tree.
- * @param validation_table The name of the table containing validation set.
- * @param max_num_classes The count of different classes.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__rep_prune_tree
- (
- tree_table_name TEXT,
- validation_table TEXT,
- max_num_classes INT
- )
-RETURNS void AS $$
-DECLARE
- num_parent_ids INTEGER;
- cf_table_name TEXT;
- encoded_table_name TEXT;
- metatable_name TEXT;
- curstmt TEXT;
- class_col_name TEXT;
- classify_result TEXT;
- temp_text TEXT;
- n INT;
- table_names TEXT[];
- swap_tree_table TEXT;
-BEGIN
- metatable_name = tree_table_name || '_di';
- class_col_name = MADLIB_SCHEMA.__get_class_column_name(metatable_name);
-
- -- the value of class column in validation table must in the KV table
- SELECT MADLIB_SCHEMA.__format
- (
- 'SELECT COUNT(*)
- FROM %
- WHERE MADLIB_SCHEMA.__to_char(%) NOT IN
- (SELECT fval FROM % WHERE fval IS NOT NULL)',
- ARRAY[
- validation_table,
- class_col_name,
- MADLIB_SCHEMA.__get_classtable_name(metatable_name)
- ]
- )
- INTO curstmt;
-
- EXECUTE curstmt INTO n;
-
- PERFORM MADLIB_SCHEMA.__assert
- (
- n = 0,
- 'the value of class column in validation table must in
- training table'
- );
-
- table_names = MADLIB_SCHEMA.__treemodel_classify_internal
- (
- validation_table,
- tree_table_name,
- 0
- );
-
- encoded_table_name = table_names[1];
- classify_result = table_names[2];
- cf_table_name = classify_result;
-
- -- after encoding in classification, class_col_name is fixed to class
- class_col_name = 'class';
-
- m4_changequote(`<!', `!>')
- m4_ifdef(<!__HAWQ__!>, <!
- EXECUTE 'DROP TABLE IF EXISTS tree_rep_pong CASCADE';
- EXECUTE 'CREATE TEMP TABLE tree_rep_pong AS SELECT * FROM ' ||
- classify_result ||
- ' LIMIT 0 m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!DISTRIBUTED BY (id)!>)';
- !>)
- m4_changequote(<!`!>, <!'!>)
-
- LOOP
- DROP TABLE IF EXISTS selected_parent_ids_rep;
- CREATE TEMP TABLE selected_parent_ids_rep
- (
- parent_id BIGINT,
- max_class INT
- ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)');
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO selected_parent_ids_rep
- SELECT parent_id, t.g[1] as max_class
- FROM
- (
- SELECT parent_id,
- MADLIB_SCHEMA.__rep_aggr_class_count
- (
- c.class,
- s.%,
- %
- ) AS g
- FROM % c, % s
- WHERE c.id=s.id
- GROUP BY parent_id
- ) t
- WHERE t.g[2] >= 0 AND
- t.parent_id IN
- (
- Select parent_id FROM %
- WHERE parent_id NOT IN
- (
- Select parent_id
- FROM %
- WHERE lmc_nid IS NOT NULL
- ) and id <> 1
- );',
- ARRAY[
- class_col_name,
- MADLIB_SCHEMA.__to_char(max_num_classes),
- classify_result,
- encoded_table_name,
- tree_table_name,
- tree_table_name
- ]
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-
- EXECUTE 'SELECT parent_id FROM selected_parent_ids_rep limit 1;'
- INTO num_parent_ids;
- IF (num_parent_ids IS NULL) THEN
- EXIT;
- END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- -- for some databases, update operation can't distribute data across segments
- -- we use two tables to update the data
- IF (classify_result = 'tree_rep_pong') THEN
- temp_text = cf_table_name;
- ELSE
- temp_text = 'tree_rep_pong';
- END IF;
-
- EXECUTE 'TRUNCATE ' || temp_text;
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %(id, class, parent_id, leaf_id)
- SELECT m.id, t.max_class, t.parent_id, t.id
- FROM % m, % t
- WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
- m.parent_id = t.id',
- ARRAY[
- temp_text,
- classify_result,
- tree_table_name
- ]
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-
- classify_result = temp_text;
-!>, <!
- SELECT MADLIB_SCHEMA.__format
- (
- 'UPDATE % m set class = t.max_class,
- parent_id = t.parent_id,leaf_id = t.id
- FROM % t
- WHERE t.id IN (SELECT parent_id FROM selected_parent_ids_rep) AND
- m.parent_id=t.id',
- classify_result,
- tree_table_name
- )
- INTO curstmt;
- EXECUTE curstmt;
-!>)
-
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT * FROM ' || tree_table_name || '
- WHERE id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)
- AND parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
-
- EXECUTE '
- INSERT INTO ' || swap_tree_table || '
- SELECT
- t1.id, t1.tree_location, t1.feature, t1.probability,
- t1.ebp_coeff, t2.max_class, t1.scv, t1.live, t1.num_of_samples,
- t1.parent_id, NULL, NULL, t1.is_cont, t1.split_value, t1.tid,
- t1.dp_ids
- FROM ' || tree_table_name || ' t1 inner join selected_parent_ids_rep t2
- ON t1.id = t2.parent_id
- WHERE t1.parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_rep)';
-
- EXECUTE '
- DROP TABLE IF EXISTS ' || tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
-!>, <!
- SELECT MADLIB_SCHEMA.__format
- (
- 'DELETE FROM % WHERE parent_id IN
- (SELECT parent_id FROM selected_parent_ids_rep)',
- tree_table_name
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'UPDATE % t1 SET lmc_nid = NULL,
- lmc_fval = NULL, max_class = t2.max_class
- FROM selected_parent_ids_rep t2
- WHERE t1.id = t2.parent_id;',
- tree_table_name
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
- END LOOP;
-
- EXECUTE 'DROP TABLE IF EXISTS ' || encoded_table_name || ' CASCADE;';
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Calculates the total errors used by Error Based Pruning (EBP).
- *
- * @param total The number of total samples represented by the node
- * being processed.
- * @param prob The probability to mis-classify samples represented by the
- * child nodes if they are pruned with EBP.
- * @param confidence_level A certainty factor to calculate the confidence limits
- * for the probability of error using the binomial theorem.
- *
- * @return The computed total error.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_calc_errors
- (
- total FLOAT8,
- prob FLOAT8,
- confidence_level FLOAT8
- ) RETURNS FLOAT8
-AS 'MODULE_PATHNAME', 'dt_ebp_calc_errors'
-LANGUAGE C STRICT IMMUTABLE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief Prune the trained tree with "Error-based Pruning" algorithm.
- *
- * @param tree_table_name The name of the table containing the tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__ebp_prune_tree
- (
- tree_table_name TEXT
- )
-RETURNS void AS $$
-DECLARE
- num_parent_ids INTEGER;
- curstmt TEXT;
- swap_tree_table TEXT;
-BEGIN
- LOOP
- DROP TABLE IF EXISTS selected_parent_ids_ebp;
- CREATE TEMP TABLE selected_parent_ids_ebp(parent_id BIGINT)
- m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(parent_id)');
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO selected_parent_ids_ebp
- SELECT s.parent_id as parent_id
- FROM
- (
- Select parent_id, sum(ebp_coeff) as ebp_coeff
- FROM
- (
- Select parent_id, ebp_coeff
- FROM %
- WHERE parent_id NOT IN
- (
- Select parent_id FROM % WHERE lmc_nid IS NOT NULL
- ) and id <> 1
- ) m
- GROUP BY m.parent_id
- ) s
- LEFT JOIN % p
- ON p.id = s.parent_id
- WHERE p.ebp_coeff < s.ebp_coeff;',
- tree_table_name,
- tree_table_name,
- tree_table_name
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-
- EXECUTE 'SELECT parent_id FROM selected_parent_ids_ebp LIMIT 1;'
- INTO num_parent_ids;
-
- IF (num_parent_ids IS NULL) THEN
- EXIT;
- END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT * FROM ' || tree_table_name || '
- WHERE parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)
- AND id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
-
- EXECUTE '
- INSERT INTO ' || swap_tree_table || '
- SELECT
- id, tree_location, feature, probability, ebp_coeff, max_class,
- scv, live, num_of_samples, parent_id, NULL, NULL, is_cont,
- split_value, tid, dp_ids
- FROM ' || tree_table_name || '
- WHERE
- id IN (SELECT parent_id FROM selected_parent_ids_ebp) AND
- parent_id NOT IN (SELECT parent_id FROM selected_parent_ids_ebp)';
-
- EXECUTE '
- DROP TABLE IF EXISTS ' || tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, tree_table_name);
-!>, <!
- SELECT MADLIB_SCHEMA.__format
- (
- 'DELETE FROM %
- WHERE parent_id IN
- (SELECT parent_id FROM selected_parent_ids_ebp)',
- tree_table_name
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'UPDATE %
- SET lmc_nid = NULL, lmc_fval = NULL
- WHERE id IN
- (SELECT parent_id FROM selected_parent_ids_ebp)',
- tree_table_name
- )
- INTO curstmt;
-
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
- END LOOP;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief Generate the final trained tree.
- *
- * @param result_tree_table_name The name of the table containing the tree.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__generate_final_tree
- (
- result_tree_table_name TEXT
- )
-RETURNS void AS $$
-DECLARE
- tree_size INTEGER;
- curstmt TEXT;
- num_redundant_nodes INTEGER;
- swap_tree_table TEXT;
-BEGIN
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT * FROM ' || result_tree_table_name || '
- WHERE COALESCE(num_of_samples,0) != 0
- AND id NOT IN (SELECT parent_id FROM ' || result_tree_table_name || ' GROUP BY parent_id)';
-
- EXECUTE '
- INSERT INTO ' || swap_tree_table || '
- SELECT
- k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
- k.scv, k.live, k.num_of_samples, k.parent_id, g.lmc_nid, g.lmc_fval, k.is_cont,
- k.split_value, k.tid, k.dp_ids
- FROM ' || result_tree_table_name || ' k INNER JOIN
- (
- SELECT parent_id,
- min(id) as lmc_nid,
- min(tree_location[array_upper(tree_location,1)])
- as lmc_fval
- FROM ' || result_tree_table_name || '
- GROUP BY parent_id
- ) g
- ON k.id = g.parent_id
- WHERE COALESCE(k.num_of_samples, 0) != 0';
-
- EXECUTE '
- DROP TABLE IF EXISTS ' || result_tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
- EXECUTE ' DELETE FROM ' || result_tree_table_name ||
- ' WHERE COALESCE(num_of_samples,0) = 0';
-
- -- for each node, find the left most child node id and the feature value,
- -- and update the node's lmc_nid and lmc_fval column
- SELECT MADLIB_SCHEMA.__format
- (
- 'UPDATE % k
- SET lmc_nid = g.lmc_nid, lmc_fval = g.lmc_fval
- FROM
- (
- SELECT parent_id,
- min(id) as lmc_nid,
- min(tree_location[array_upper(tree_location,1)])
- as lmc_fval
- FROM %
- GROUP BY parent_id
- ) g
- WHERE k.id = g.parent_id',
- ARRAY[
- result_tree_table_name,
- result_tree_table_name
- ]
- )
- INTO curstmt;
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
- /*
- * For a certain node, if all of its children are leaf nodes and have the
- * same class label, we can safely remove its children. After removal, we
- * should apply the same operation to the new leaf nodes until no nodes
- * meet this criterion.
- */
- LOOP
- EXECUTE 'DROP TABLE IF EXISTS trim_tree_aux_table CASCADE';
- -- Find nodes whose children should be removed.
- curstmt = MADLIB_SCHEMA.__format
- (
- 'CREATE TEMP TABLE trim_tree_aux_table AS
- SELECT parent_id FROM
- (
- SELECT parent_id, count(distinct max_class) as class_count
- FROM %
- WHERE parent_id IN
- (
- SELECT parent_id FROM %
- WHERE parent_id NOT IN
- (
- SELECT parent_id
- FROM %
- WHERE lmc_nid IS NOT NULL
- ) and parent_id <> 0
- )
- GROUP BY parent_id
- ) l
- where l.class_count=1
- m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (parent_id)')',
- ARRAY[
- result_tree_table_name,
- result_tree_table_name,
- result_tree_table_name
- ]
- );
- EXECUTE curstmt;
-
- EXECUTE 'SELECT count(*) FROM trim_tree_aux_table'
- INTO num_redundant_nodes;
-
- IF (num_redundant_nodes <= 0) THEN
- EXIT;
- END IF;
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT * FROM ' || result_tree_table_name || '
- WHERE parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)
- AND id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
-
- EXECUTE '
- INSERT INTO ' || swap_tree_table || '
- SELECT
- k.id, k.tree_location, k.feature, k.probability, k.ebp_coeff, k.max_class,
- k.scv, k.live, k.num_of_samples, k.parent_id, NULL, NULL, k.is_cont,
- k.split_value, k.tid, k.dp_ids
- FROM ' || result_tree_table_name || ' k INNER JOIN
- (
- SELECT parent_id FROM trim_tree_aux_table
- ) g
- ON k.id = g.parent_id
- WHERE k.parent_id NOT IN (SELECT parent_id FROM trim_tree_aux_table)';
-
- EXECUTE '
- DROP TABLE IF EXISTS ' || result_tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
- -- Delete the found redundant nodes.
- curstmt = MADLIB_SCHEMA.__format
- (
- '
- DELETE FROM % t
- WHERE t.parent_id IN
- (SELECT parent_id FROM trim_tree_aux_table)',
- ARRAY[
- result_tree_table_name
- ]
- );
- EXECUTE curstmt;
-
- -- Set the nodes, whose children are removed, to be leaf nodes.
- curstmt = MADLIB_SCHEMA.__format
- (
- 'UPDATE % k
- SET lmc_nid = NULL, lmc_fval = NULL
- FROM
- (
- SELECT parent_id FROM trim_tree_aux_table
- ) g
- WHERE k.id = g.parent_id',
- ARRAY[
- result_tree_table_name
- ]
- );
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
- END LOOP;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * The UDT for the training result.
- *
- * num_of_samples It means how many records there exists in the
- * training set.
- * features_per_node The number of features chosen for each tree.
- * num_tree_nodes The number of tree nodes.
- * max_tree_depth The max tree depth.
- * calc_acc_time Total time of calculating acc.
- * calc_pre_time Time of preprocessing when calculating acc.
- * update_time Total time of updating operation after found
- * the best time.
- * update_best Time of updating the best splits' information.
- * update_child Time of generating the child nodes.
- * update_nid Time of updating the assigned node IDs.
- * scv_acs_time Time of calculating the best splits.
- * prune_time Time of tree pruning.
- *
- */
-DROP TYPE IF EXISTS MADLIB_SCHEMA.__train_result CASCADE;
-CREATE TYPE MADLIB_SCHEMA.__train_result AS
-(
- num_of_samples BIGINT,
- features_per_node INT,
- num_tree_nodes INT,
- max_tree_depth INT,
- calc_acc_time INTERVAL,
- calc_pre_time INTERVAL,
- update_time INTERVAL,
- update_best INTERVAL,
- update_child INTERVAL,
- update_nid INTERVAL,
- scv_acs_time INTERVAL,
- prune_time INTERVAL
-);
-
-
-/*
- * @brief The function samples a set of integer values between low and high.
- *
- * @param num_of_samples The number of records to be sampled.
- * @param low The low limit of sampled values.
- * @param high The high limit of sampled values.
- *
- * @return A set of integer values sampled randomly between [low, high].
- *
- */
-DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__sample_within_range
- (
- BIGINT,
- BIGINT,
- BIGINT
- )CASCADE;
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_within_range
- (
- num_of_samples BIGINT,
- low BIGINT,
- high BIGINT
- )
-RETURNS SETOF BIGINT
-AS 'MODULE_PATHNAME', 'dt_sample_within_range'
-LANGUAGE C STRICT VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-
-
-/*
- * @brief The function samples with replacement from source table and store
- * the results to target table.
- *
- * In this function, we firstly calculate how many samples should be
- * generated in each segment. Then, we let those segments sample with
- * replacement between the maximum ID and minimum ID of the source table
- * in parallel and assign samples to different trees.
- *
- * If there are gaps in the ID column of the source table, we sample
- * extra records in proportion to the number of gaps. At last, we remove
- * these invalid samples with an inner join operation with the source
- * table. Since we target big data, this strategy works quite well.
- *
- * @param num_of_tree The number of trees to be trained.
- * @param size_per_tree The number of records to be sampled for each tree.
- * @param src_table The name of the table to be sampled from.
- * @param target_table The name of the table used to store the results.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__sample_with_replacement
- (
- num_of_tree INT,
- size_per_tree BIGINT,
- src_table TEXT,
- target_table TEXT
- )
-RETURNS VOID AS $$
-DECLARE
- segment_num INT;
- sample_per_seg BIGINT;
- sample_ratio FLOAT8;
- record_num FLOAT8;
- min_id BIGINT;
- max_id BIGINT;
- range FLOAT8;
- stmt TEXT;
-BEGIN
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__POSTGRESQL__!>, <!
- -- fix the segment number to 1 for PG
- segment_num = 1;
-!>, <!
- -- get the segment number
- SELECT COUNT(distinct content) FROM gp_segment_configuration
- WHERE content<>-1 INTO segment_num;
-!>)
-m4_changequote(<!`!>, <!'!>)
-
-
- DROP TABLE IF EXISTS auxiliary_segment_table;
- CREATE TEMP TABLE auxiliary_segment_table
- (
- segment_id INT
- ) m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY(segment_id)');
-
- -- Insert segment_num of records distributed by segment id
- EXECUTE 'INSERT INTO auxiliary_segment_table
- SELECT generate_series(1,'||segment_num||');';
-
- EXECUTE 'SELECT max(id),min(id), count(id) as record_num
- FROM '||src_table||';' INTO max_id,min_id,record_num;
- range=max_id-min_id+1;
-
- -- compute the sample ratio
- sample_ratio= range/record_num;
-
- -- compute how many records should be sampled by each segment
- sample_per_seg=((sample_ratio*num_of_tree*size_per_tree)/segment_num)::BIGINT;
-
- -- add the weight field
-
- IF (range > record_num) THEN
- -- remove those invalid samples with join operation
- stmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %(id, tid, nid, weight)
- SELECT record_id,
- tid AS tid,
- tid AS nid,
- count(*) AS weight
- FROM
- (
- SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
- MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
- FROM auxiliary_segment_table
- ) t,
- % k
- WHERE t.record_id=k.id
- GROUP BY record_id, tid, nid',
- ARRAY[
- target_table,
- sample_per_seg::TEXT,
- min_id::TEXT,
- max_id::TEXT,
- sample_per_seg::TEXT,
- num_of_tree::TEXT,
- src_table
- ]
- );
- ELSE
- stmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %(id, tid, nid, weight)
- SELECT record_id,
- tid AS tid,
- tid AS nid,
- count(*) AS weight
- FROM
- (
- SELECT MADLIB_SCHEMA.__sample_within_range(%, %, %) AS record_id,
- MADLIB_SCHEMA.__sample_within_range(%, 1, %) AS tid
- FROM auxiliary_segment_table
- ) t
- GROUP BY record_id, tid, nid',
- ARRAY[
- target_table,
- sample_per_seg::TEXT,
- min_id::TEXT,
- max_id::TEXT,
- sample_per_seg::TEXT,
- num_of_tree::TEXT
- ]
- );
- END IF;
-
- EXECUTE stmt;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief This function trains a decision tree or random forest.
- *
- * @param split_criterion This parameter specifies which split criterion
- * should be used for tree construction and
- * pruning. The valid values are infogain,
- * gainratio, and gini.
- * @param num_trees Total number of trees to be trained.
- * @param features_per_node Total number of features used to compute split
- * gain for each node.
- * @param training_table_name The name of the table/view with the source data.
- * @param training_table_meta The name of the table with the meta data.
- * @param result_tree_table_name The name of the table where the resulting
- * DT/RF will be stored.
- * @param validation_table_name The validation table used for pruning tree.
- * @param id_col_name The name of the column containing id of each point.
- * @param class_col_name The name of the column containing correct class
- * of each point.
- * @param confidence_level A statistical confidence interval of the
- * resubstitution error.
- * @param max_tree_depth Maximum decision tree depth.
- * @param node_prune_threshold Specifies the minimum number of samples required
- * in a child node.
- * @param node_split_threshold Specifies the minimum number of samples required
- * in a node in order for a further split
- * to be possible.
- * @param sampling_needed Whether enabling the sampling functionality.
- * @param h2hmv_routine_id Specifies how to handle missing values.
- * 1 ignore, 2 explicit.
- * @param verbosity > 0 means this function runs in verbose mode.
- *
- * @return The record including training related information.
- * Details please refer to the UDT: MADLIB_SCHEMA.__train_result.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__train_tree
- (
- split_criterion TEXT,
- num_trees INT,
- features_per_node INT,
- training_table_name TEXT,
- training_table_meta TEXT,
- result_tree_table_name TEXT,
- validation_table_name TEXT,
- id_col_name TEXT,
- class_col_name TEXT,
- confidence_level FLOAT,
- max_tree_depth INT,
- sampling_percentage FLOAT,
- node_prune_threshold FLOAT,
- node_split_threshold FLOAT,
- sampling_needed BOOLEAN,
- h2hmv_routine_id INT,
- verbosity INT
- )
-RETURNS MADLIB_SCHEMA.__train_result AS $$
-DECLARE
- num_live_nodes INT;
- max_nid INT;
- location INT[];
- temp_location INT[];
- num_classes INT;
- answer record;
- location_size INT;
- begin_func_exec TIMESTAMP;
- begin_find_best TIMESTAMP;
- scv_acs_time INTERVAL;
- begin_data_transfer TIMESTAMP;
- begin_update_best TIMESTAMP;
- begin_update_child TIMESTAMP;
- begin_update_nid TIMESTAMP;
- calc_update_best INTERVAL;
- calc_update_child INTERVAL;
- calc_update_nid INTERVAL;
- ins_upd_time INTERVAL;
- begin_olap_acs TIMESTAMP;
- calc_acc_time INTERVAL;
- calc_pre_time INTERVAL;
- calc_olap_time INTERVAL;
- begin_bld_assoc TIMESTAMP;
- bld_assoc_time INTERVAL;
- begin_prune TIMESTAMP;
- prune_time INTERVAL;
- total_size FLOAT;
- sc_code INT := 1;
- curstmt TEXT := '';
- grow_tree INT := max_tree_depth;
- ret MADLIB_SCHEMA.__train_result;
- curr_level INT := 1;
- dp_ids INT[];
- dp_ids_text TEXT;
- instance_time MADLIB_SCHEMA.__gen_acc_time;
- tr_table_index INT := 1;
- tr_tables TEXT[] := '{tr_assoc_ping, tr_assoc_pong}';
- cur_tr_table TEXT := 'tr_assoc_ping';
- need_analyze BOOL := 't'::BOOL;
- attr_count INT;
- swap_tree_table TEXT;
- tree_tbl_rows TEXT;
-BEGIN
- -- record the time costed in different steps when training
- begin_func_exec = clock_timestamp();
- scv_acs_time = begin_func_exec - begin_func_exec;
- calc_olap_time = scv_acs_time;
- calc_acc_time = scv_acs_time;
- calc_pre_time = scv_acs_time;
- ins_upd_time = scv_acs_time;
- calc_update_best = scv_acs_time;
- calc_update_child = scv_acs_time;
- calc_update_nid = scv_acs_time;
- bld_assoc_time = scv_acs_time;
- prune_time = scv_acs_time;
-
- IF(split_criterion = 'infogain') THEN
- sc_code = 1;
- ELSIF (split_criterion = 'gainratio') THEN
- sc_code = 2;
- ELSIF (split_criterion = 'gini') THEN
- sc_code = 3;
- ELSE
- RAISE EXCEPTION '%', 'Invalid split criterion!';
- END IF;
-
- num_classes = MADLIB_SCHEMA.__num_of_class(training_table_meta);
-
- IF(verbosity > 0) THEN
- RAISE INFO 'NUMBER OF CLASSES IN THE TRAINING SET %', num_classes;
- END IF;
-
- IF(num_classes < 2) THEN
- RAISE EXCEPTION 'the number of classes must be greater than 2';
- END IF;
-
- curstmt = MADLIB_SCHEMA.__format
- (
- 'SELECT
- count(*)
- FROM %
- WHERE column_type=''f''',
- training_table_meta
- );
- EXECUTE curstmt INTO attr_count;
-
- -- generate the horizontal table for updating assinged node IDs
- PERFORM MADLIB_SCHEMA.__gen_horizontal_encoded_table
- (
- 'tmp_dt_hori_table',
- training_table_name,
- attr_count,
- verbosity
- );
-
- EXECUTE 'SELECT count(*) FROM tmp_dt_hori_table' INTO total_size;
-
- IF(verbosity > 0) THEN
- RAISE INFO 'INPUT TABLE SIZE: %', total_size;
- END IF;
-
- begin_bld_assoc = clock_timestamp();
- cur_tr_table = tr_tables[tr_table_index];
-
- -- The table of tr_assoc holds the information of which records are
- -- used during training for each tree.
- -- It has four columns.
- -- id -- The id of one record.
- -- tid -- The id of a tree.
- -- nid -- The id of a node in a tree.
- -- weight -- The times a record is assigned to a node.
- IF (sampling_needed) THEN
- PERFORM MADLIB_SCHEMA.__sample_with_replacement
- (
- num_trees,
- round(sampling_percentage * total_size)::BIGINT,
- 'tmp_dt_hori_table',
- cur_tr_table
- );
- ELSE
- curstmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %
- SELECT id, 1 as tid, 1 as nid, 1 as weight
- FROM %',
- ARRAY[
- cur_tr_table,
- 'tmp_dt_hori_table'
- ]
- );
- EXECUTE curstmt;
- END IF;
-
- -- analyze ping
- EXECUTE 'ANALYZE ' || cur_tr_table;
- bld_assoc_time = clock_timestamp() - begin_bld_assoc;
-
- -- generate the root node for all trees.
- -- the generated numbers are the same for the two generate_series
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO %
- (id, tree_location, feature, probability, max_class,scv,
- live, num_of_samples, parent_id, tid)
- SELECT generate_series(1, %), ARRAY[0], 0, 1, 1, 1, 1, 0, 0,
- generate_series(1, %)',
- ARRAY[
- result_tree_table_name,
- num_trees::TEXT,
- num_trees::TEXT
- ]
- ) INTO curstmt;
-
- EXECUTE curstmt;
-
- max_nid = num_trees;
- location_size = 0;
-
-
- LOOP
- EXECUTE 'SELECT COUNT(id) FROM ' || result_tree_table_name ||
- ' WHERE live > 0 AND array_upper(tree_location,1)='||
- curr_level||';' INTO num_live_nodes;
-
- IF (num_live_nodes < 1) THEN
- IF(verbosity > 0) THEN
- RAISE INFO 'EXIT: %', 'no live nodes to split';
- END IF;
-
- EXIT;
- END IF;
-
- IF (verbosity > 0) THEN
- RAISE INFO 'Running on level:%', curr_level;
- END IF;
-
- begin_olap_acs = clock_timestamp();
-
- instance_time = MADLIB_SCHEMA.__gen_acc
- (
- training_table_name,
- training_table_meta,
- result_tree_table_name,
- cur_tr_table,
- 'sf_assoc',
- features_per_node,
- num_classes,
- sampling_needed,
- verbosity
- );
-
- IF (h2hmv_routine_id=1) THEN
- -- For ignore, we need the true size of nodes to handle the
- -- missing values.
- TRUNCATE node_size_aux;
-
- curstmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO node_size_aux
- SELECT tr.tid, tr.nid, sum(weight) as count
- FROM % tr
- GROUP BY tr.tid, tr.nid',
- cur_tr_table
- );
-
- EXECUTE curstmt;
- END IF;
-
- calc_pre_time = calc_pre_time + instance_time.calc_pre_time;
- calc_acc_time = calc_acc_time + instance_time.calc_acc_time;
- calc_olap_time = calc_olap_time + (clock_timestamp() - begin_olap_acs);
-
- curr_level = curr_level + 1;
-
- begin_find_best = clock_timestamp();
-
- PERFORM MADLIB_SCHEMA.__find_best_split
- (
- 'training_instance',
- confidence_level,
- training_table_meta,
- sc_code,
- grow_tree,
- 'find_best_answer_table',
- h2hmv_routine_id,
- num_classes
- );
- IF (verbosity > 0) THEN
- RAISE INFO 'find best time at this level:%',
- clock_timestamp() - begin_find_best;
- END IF;
- grow_tree = grow_tree - 1;
-
- scv_acs_time = scv_acs_time +
- (clock_timestamp() - begin_find_best);
- begin_data_transfer = clock_timestamp();
- begin_update_best = clock_timestamp();
-
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT t.* FROM ' || result_tree_table_name || ' t left join find_best_answer_table c
- ON t.id = c.node_id AND t.tid = c.tid
- WHERE node_id is NULL';
-
- EXECUTE '
- INSERT INTO ' || swap_tree_table || '
- SELECT
- k.id, k.tree_location, c.feature, c.probability, c.ebp_coeff, c.max_class,
- c.max_scv, 0, c.node_size, k.parent_id, k.lmc_nid, k.lmc_fval, c.is_cont,
- c.split_value, k.tid, k.dp_ids
- FROM ' || result_tree_table_name || ' k inner join find_best_answer_table c
- on k.id=c.node_id AND k.tid=c.tid';
-
- EXECUTE '
- DROP TABLE IF EXISTS ' || result_tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
- -- We get the calculation result for current level.
- -- Update the nodes of previous level firstly.
- SELECT MADLIB_SCHEMA.__format
- (
- 'UPDATE % t
- SET feature = c.feature,
- probability = c.probability,
- max_class = c.max_class,
- scv = c.max_scv,
- ebp_coeff = c.ebp_coeff,
- num_of_samples = c.node_size,
- live = 0,
- is_cont = c.is_cont,
- split_value = c.split_value
- FROM find_best_answer_table c
- WHERE t.id=c.node_id AND t.tid=c.tid',
- ARRAY[
- result_tree_table_name::TEXT
- ]
- ) INTO curstmt;
-
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
- calc_update_best = calc_update_best +
- (clock_timestamp() - begin_update_best);
- begin_update_child = clock_timestamp();
-
- curstmt=
- MADLIB_SCHEMA.__format(
- 'INSERT INTO %(id, tree_location, feature, probability,
- max_class, scv, live, parent_id, tid, dp_ids)
- SELECT %+row, array_append(tree_location, fval),
- 0, 1, 1, 1, %, ans.node_id, ans.tid,
- CASE when(NOT ans.is_cont) then
- array_append( dp_ids, ans.feature)
- ELSE
- dp_ids
- END
- FROM % tree,
- (
- SELECT *,
- row_number()
- OVER (ORDER BY l.tid, l.node_id, l.fval) AS row
- FROM
- (
- SELECT *,
- CASE WHEN (is_cont) THEN
- generate_series(1,2)
- ELSE
- generate_series(1, distinct_features)
- END AS fval
- FROM
- find_best_answer_table
- WHERE live>0 AND coalesce(feature, 0) <> 0
- AND node_size >= % AND node_size >= %
- ) l
- ) ans
- WHERE tree.id=ans.node_id and tree.tid=ans.tid;',
- ARRAY[
- result_tree_table_name,
- (max_nid)::TEXT,
- curr_level::TEXT,
- result_tree_table_name,
- (total_size * node_prune_threshold)::TEXT,
- (total_size * node_split_threshold)::TEXT
- ]
- );
- IF(verbosity > 0) THEN
- RAISE INFO 'Generate Child Nodes:%', curstmt;
- END IF;
-
- EXECUTE curstmt;
-
- EXECUTE 'SELECT max(id) FROM '||result_tree_table_name INTO max_nid;
-
- IF(verbosity > 0) THEN
- RAISE INFO 'Max nid:%, level:%', max_nid, curr_level;
- END IF;
-
- -- insert the leftmost child node id and relevant info
- -- to the assoc_aux table, so that we will make use of this
- -- info to update the assigned nid the samples belong to
- -- the current node whose id is answer.node_id.
- SELECT MADLIB_SCHEMA.__format
- (
- 'INSERT INTO assoc_aux
- (nid, fid, lmc_id, svalue, is_cont)
- SELECT t.id, t.feature, min(l.id),
- t.split_value, t.is_cont
- FROM
- (SELECT id, parent_id
- FROM %
- WHERE array_upper(tree_location,1)=%) l,
- % t
- WHERE l.parent_id=t.id
- GROUP BY t.id, t.feature, t.split_value, t.is_cont;',
- ARRAY[
- result_tree_table_name,
- curr_level::TEXT,
- result_tree_table_name
- ]
- ) INTO curstmt;
-
- IF(verbosity > 0) THEN
- RAISE INFO 'Update lmc_child Info:%', curstmt;
- END IF;
-
- EXECUTE curstmt;
-
- -- delete the unused nodes on the previous level
- -- delete those nodes with a size less than node_prune_threshold
- -- node_prune_threshold will not apply to root node,
- -- the level is 1 (curr_level - 1 = 1);
-
- IF (curr_level > 2) THEN
-m4_changequote(`<!', `!>')
-m4_ifdef(<!__HAWQ__!>, <!
- SELECT MADLIB_SCHEMA.__unique_string() INTO swap_tree_table;
- EXECUTE '
- DROP TABLE IF EXISTS ' || swap_tree_table || ';
- CREATE TABLE ' || swap_tree_table || ' AS
- SELECT t.* FROM ' || result_tree_table_name || ' t
- WHERE (t.num_of_samples is NULL OR t.num_of_samples >= ' || total_size * node_prune_threshold || ')
- AND t.live != ' || curr_level - 1;
- EXECUTE '
- DROP TABLE IF EXISTS ' || result_tree_table_name;
- PERFORM MADLIB_SCHEMA.__rename_table(swap_tree_table, result_tree_table_name);
-!>, <!
- curstmt = MADLIB_SCHEMA.__format
- (
- 'DELETE FROM % t
- WHERE t.num_of_samples < % OR live = %;',
- ARRAY[
- result_tree_table_name::TEXT,
- (total_size * node_prune_threshold)::TEXT,
- (curr_level - 1)::TEXT
- ]
- );
- EXECUTE curstmt;
-!>)
-m4_changequote(<!`!>, <!'!>)
- END IF;
-
- calc_update_child = calc_update_child + (clock_timestamp() - begin_update_child);
- begin_update_nid = clock_timestamp();
-
- -- update the assigned node id for each sample on the current level
- tr_table_index = (tr_table_index % 2) + 1;
- curstmt = MADLIB_SCHEMA.__format
- (
- 'INSERT INTO % (id, nid, tid, weight)
- SELECT
- tr.id,
- au.lmc_id - 1 +
- CASE WHEN (au.is_cont) THEN
- CASE WHEN (svalue < vt.fvals[au.fid]) THEN
- 2
- ELSE
- 1
- END
- ELSE
- vt.fvals[au.fid]::INT
- END AS nid,
- tid, weight
- FROM % tr, % vt, assoc_aux au
- WHERE tr.nid = au.nid AND vt.id = tr.id AND vt.fvals[au.fid] IS NOT NULL',
- ARRAY[
- tr_tables[tr_table_index],
- cur_tr_table,
- 'tmp_dt_hori_table'
- ]
- );
- IF (verbosity > 0) THEN
- RAISE INFO '%', curstmt;
- END IF;
-
- EXECUTE curstmt;
- EXECUTE 'TRUNCATE ' || cur_tr_table;
- cur_tr_table = tr_tables[tr_table_index];
-
- IF (need_analyze) THEN
- -- analyze pong table
- EXECUTE 'ANALYZE ' || cur_tr_table;
- need_analyze = 'f'::BOOL;
- END IF;
-
- EXECUTE 'TRUNCATE assoc_aux';
-
- calc_update_nid = calc_update_nid + (clock_timestamp() - begin_update_nid);
-
- ins_upd_time = ins_upd_time +
- (clock_timestamp() - begin_data_transfer);
- IF(verbosity > 0) THEN
- RAISE INFO 'computation time in this level:%',
- (clock_timestamp() - begin_find_best);
- END IF;
-
- END LOOP;
-
- PERFORM MADLIB_SCHEMA.__generate_final_tree(result_tree_table_name);
-
- begin_prune = clock_timestamp();
- IF (confidence_level < 100.0) THEN
- PERFORM MADLIB_SCHEMA.__ebp_prune_tree(result_tree_table_name);
- END IF;
-
- IF (validation_table_name IS NOT NULL) THEN
- PERFORM MADLIB_SCHEMA.__rep_prune_tree
- (
- result_tree_table_name,
- validation_table_name ,
- num_classes
- );
- END IF;
- prune_time = clock_timestamp() - begin_prune;
-
- IF(verbosity > 0) THEN
- RAISE INFO 'time of sampling with replacement: %', bld_assoc_time;
- RAISE INFO 'time of finding best and calculating ACS: %', scv_acs_time;
- RAISE INFO 'time of calculating ACC: %', calc_acc_time;
- RAISE INFO 'time of Insert/update operation: %', ins_upd_time;
- RAISE INFO 'time of pruning: %', prune_time;
- RAISE INFO 'time of training: %', clock_timestamp() - begin_func_exec;
- END IF;
-
- SELECT MADLIB_SCHEMA.__format
- (
- 'SELECT COUNT(id), max(array_upper(tree_location, 1))
- FROM %',
- ARRAY[
- result_tree_table_name
- ]
- ) INTO curstmt;
-
- EXECUTE curstmt INTO ret.num_tree_nodes, ret.max_tree_depth;
-
- ret.features_per_node = features_per_node;
- ret.num_of_samples = total_size;
- ret.calc_acc_time = calc_acc_time;
- ret.calc_pre_time = calc_pre_time;
- ret.update_time = ins_upd_time;
- ret.update_best = calc_update_best;
- ret.update_child = calc_update_child;
- ret.update_nid = calc_update_nid;
- ret.scv_acs_time = scv_acs_time;
- ret.prune_time = prune_time;
-
- RETURN ret;
-END
-$$ LANGUAGE PLPGSQL
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-
-
-/*
- * @brief This is an internal function for displaying one tree node in human
- * readable format. It is the step function of aggregation named
- * __display_tree_aggr.
- *
- * @param state This variable is used to store the accumulated tree
- * display information.
- * @param depth The depth of this node.
- * @param is_cont Whether the feature used to split is continuous.
- * @param feat_name The name of the feature used to split.
- * @param curr_val The value of the splitting feature for this node.
- * @param split_value For continuous feature, it specifies the split value.
- * Otherwise, it is of no meaning.
- * @param max_prob For those elements in this node, the probability that
- * an element belongs to the max_class.
- * @param max_class The class ID with the largest number of elements
- * for those elements in this node.
- * @param num_of_samples Total count of samples in this node.
- *
- * @return It returns the text containing the information of human
- * readable information for trees.
- *
- */
-CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__display_node_sfunc
- (
- state TEXT,
- depth INT,
- is_cont BOOLEAN,
- feat_name TEXT,
- curr_val TEXT,
- split_value FLOAT8,
- max_prob FLOAT8,
- max_class TEXT,
- num_of_samples INT
- )
-RETURNS TEXT AS $$
-DECLARE
- ret TEXT := '';
- index INT;
-BEGIN
- -- We add indentation based on the depth.
- FOR index IN 0..depth LOOP
- ret = ret || ' ';
- END LOOP;
-
- IF (depth > 0) THEN
- ret = ret ||coalesce(feat_name,'null')||': ';
- -- For continuous features, there are two splits.
- -- We will mark curr_val to 1 for '<='. Otherwise,
- -- we will mark curr_val to 2.
- IF (is_cont) THEN
- IF (curr_val::INT = 1) THEN
- ret = ret || ' <= ';
- ELSE
- ret = ret || ' > ';
- END IF;
- ret = ret||coalesce(split_value,0)||' ';
- ELSE
- ret = ret||' = '||coalesce(curr_val,'null')||' ';
- END IF;
- ELSE
- ret = ret||'Root Node ';
- END IF;
-
- ret = ret ||
- ' : class(' ||
- coalesce(max_class,null) ||
- ') num_elements(' ||
- coalesce(
<TRUNCATED>