You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/03/13 20:57:34 UTC
[12/50] [abbrv] incubator-madlib git commit: RF: Fixes the online help and example
RF: Fixes the online help and example
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e384c1fc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e384c1fc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e384c1fc
Branch: refs/heads/latest_release
Commit: e384c1fc7bb27b7c2401b17b6049cee1374fee1a
Parents: 498c559
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 23 15:45:08 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 23 15:45:08 2017 -0800
----------------------------------------------------------------------
.../recursive_partitioning/random_forest.py_in | 83 +++++++++++---------
1 file changed, 47 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e384c1fc/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index e006a34..0eb5985 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -103,6 +103,10 @@ SELECT {schema_madlib}.forest_train(
is a positive integer with the default 0.
verbose, -- Boolean, whether to print more info,
default is False
+ sample_ratio -- Double precision, in the range of (0, 1], default: 1
+ If sample_ratio is less than 1, a bootstrap sample
+ size smaller than the data table is expected to be
+ used for training each tree in the forest.
);
------------------------------------------------------------
@@ -175,44 +179,51 @@ it has the following columns:
------------------------------------------------------------
EXAMPLE
------------------------------------------------------------
-DROP TABLE IF EXISTS dummy_dt_con_src CASCADE;
-CREATE TABLE dummy_dt_con_src (
- id INTEGER,
- cat INTEGER[],
- con FLOAT8[],
- y FLOAT8
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+ id integer NOT NULL,
+ "OUTLOOK" text,
+ temperature double precision,
+ humidity double precision,
+ windy text,
+ class text
);
-INSERT INTO dummy_dt_src VALUES
-(1, '{0}'::INTEGER[], ARRAY[0], 0.5),
-(2, '{0}'::INTEGER[], ARRAY[1], 0.5),
-(3, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(4, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(5, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(6, '{0}'::INTEGER[], ARRAY[5], 0.1),
-(7, '{0}'::INTEGER[], ARRAY[6], 0.1),
-(8, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(9, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(10, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(11, '{1}'::INTEGER[], ARRAY[9], 0.1);
-
-DROP TABLE IF EXISTS forest_out, forest_out_summary;
-SELECT madlib.forest_train(
- 'dummy_dt_src',
- 'forest_out',
- 'id',
- 'y',
- 'cat, con',
- '',
- 'mse',
- NULL::Text,
- NULL::Text,
- 3,
- 2,
- 1,
- 5);
-
-SELECT madlib.forest_display('forest_out');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf', -- source table
+ 'train_output', -- output model table
+ 'id', -- id column
+ 'class', -- response
+ '"OUTLOOK", temperature, humidity, windy', -- features
+ NULL, -- exclude columns
+ NULL, -- grouping columns
+ 20::integer, -- number of trees
+ 2::integer, -- number of random features
+ TRUE::boolean, -- variable importance
+ 1::integer, -- num_permutations
+ 8::integer, -- max depth
+ 3::integer, -- min split
+ 1::integer, -- min bucket
+ 10::integer -- number of splits per continuous variable
+);
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+
"""
else:
help_string = "No such option. Use {schema_madlib}.forest_train('usage')"