You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/03/13 20:57:34 UTC

[12/50] [abbrv] incubator-madlib git commit: RF: Fixes the online help and example

RF: Fixes the online help and example


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e384c1fc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e384c1fc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e384c1fc

Branch: refs/heads/latest_release
Commit: e384c1fc7bb27b7c2401b17b6049cee1374fee1a
Parents: 498c559
Author: Orhan Kislal <ok...@pivotal.io>
Authored: Mon Jan 23 15:45:08 2017 -0800
Committer: Orhan Kislal <ok...@pivotal.io>
Committed: Mon Jan 23 15:45:08 2017 -0800

----------------------------------------------------------------------
 .../recursive_partitioning/random_forest.py_in  | 83 +++++++++++---------
 1 file changed, 47 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e384c1fc/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index e006a34..0eb5985 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -103,6 +103,10 @@ SELECT {schema_madlib}.forest_train(
                                 is an positive integer with the default 0.
     verbose,                -- Boolean, whether to print more info,
                               default is False
+    sample_ratio            -- Double precision, in the range of (0, 1], default: 1
+                                If sample_ratio is less than 1, each tree in the
+                                forest is trained on a bootstrap sample that is
+                                expected to be smaller than the data table.
 );
 
 ------------------------------------------------------------
@@ -175,44 +179,51 @@ it has the following columns:
 ------------------------------------------------------------
                         EXAMPLE
 ------------------------------------------------------------
-DROP TABLE IF EXISTS dummy_dt_con_src CASCADE;
-CREATE TABLE dummy_dt_con_src (
-    id  INTEGER,
-    cat INTEGER[],
-    con FLOAT8[],
-    y   FLOAT8
+DROP TABLE IF EXISTS dt_golf;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    windy text,
+    class text
 );
 
-INSERT INTO dummy_dt_src VALUES
-(1, '{0}'::INTEGER[], ARRAY[0], 0.5),
-(2, '{0}'::INTEGER[], ARRAY[1], 0.5),
-(3, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(4, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(5, '{0}'::INTEGER[], ARRAY[4], 0.5),
-(6, '{0}'::INTEGER[], ARRAY[5], 0.1),
-(7, '{0}'::INTEGER[], ARRAY[6], 0.1),
-(8, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(9, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(10, '{1}'::INTEGER[], ARRAY[9], 0.1);
-(11, '{1}'::INTEGER[], ARRAY[9], 0.1);
-
-DROP TABLE IF EXISTS forest_out, forest_out_summary;
-SELECT madlib.forest_train(
-    'dummy_dt_src',
-    'forest_out',
-    'id',
-    'y',
-    'cat, con',
-    '',
-    'mse',
-    NULL::Text,
-    NULL::Text,
-    3,
-    2,
-    1,
-    5);
-
-SELECT madlib.forest_display('forest_out');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
+(1, 'sunny', 85, 85, 'false', 'Don''t Play'),
+(2, 'sunny', 80, 90, 'true', 'Don''t Play'),
+(3, 'overcast', 83, 78, 'false', 'Play'),
+(4, 'rain', 70, 96, 'false', 'Play'),
+(5, 'rain', 68, 80, 'false', 'Play'),
+(6, 'rain', 65, 70, 'true', 'Don''t Play'),
+(7, 'overcast', 64, 65, 'true', 'Play'),
+(8, 'sunny', 72, 95, 'false', 'Don''t Play'),
+(9, 'sunny', 69, 70, 'false', 'Play'),
+(10, 'rain', 75, 80, 'false', 'Play'),
+(11, 'sunny', 75, 70, 'true', 'Play'),
+(12, 'overcast', 72, 90, 'true', 'Play'),
+(13, 'overcast', 81, 75, 'false', 'Play'),
+(14, 'rain', 71, 80, 'true', 'Don''t Play');
+
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('dt_golf',         -- source table
+    'train_output',    -- output model table
+    'id',              -- id column
+    'class',           -- response
+    '"OUTLOOK", temperature, humidity, windy',   -- features
+    NULL,              -- exclude columns
+    NULL,              -- grouping columns
+    20::integer,       -- number of trees
+    2::integer,        -- number of random features
+    TRUE::boolean,     -- variable importance
+    1::integer,        -- num_permutations
+    8::integer,        -- max depth
+    3::integer,        -- min split
+    1::integer,        -- min bucket
+    10::integer        -- number of splits per continuous variable
+);
+SELECT madlib.get_tree('train_output',1,2,FALSE);
+
         """
     else:
         help_string = "No such option. Use {schema_madlib}.forest_train('usage')"