Posted to commits@madlib.apache.org by ok...@apache.org on 2022/05/12 13:37:43 UTC
[madlib] 03/06: Doc: Update online examples for various supervised learning modules
This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 26eea8d6d27cb4b096cf9eb92c9ea586b62ace98
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Thu Apr 28 15:32:29 2022 +0300
Doc: Update online examples for various supervised learning modules
---
src/ports/postgres/modules/convex/mlp.sql_in | 96 +--------
src/ports/postgres/modules/crf/crf.sql_in | 223 ++++++++------------
.../modules/elastic_net/elastic_net.sql_in | 227 ++++++++++++---------
src/ports/postgres/modules/glm/glm.sql_in | 5 +
.../postgres/modules/glm/multiresponseglm.sql_in | 5 +-
src/ports/postgres/modules/glm/ordinal.sql_in | 2 -
.../recursive_partitioning/decision_tree.sql_in | 4 +-
.../modules/regress/clustered_variance.sql_in | 188 ++++++++++++++---
src/ports/postgres/modules/regress/marginal.sql_in | 143 ++++++++-----
src/ports/postgres/modules/regress/robust.sql_in | 4 +-
.../postgres/modules/stats/cox_prop_hazards.sql_in | 12 +-
src/ports/postgres/modules/svm/svm.sql_in | 31 +--
.../modules/validation/cross_validation.sql_in | 14 +-
13 files changed, 512 insertions(+), 442 deletions(-)
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index 7387ae27..bac7d6ce 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -997,6 +997,7 @@ weights for the coefficients when run with warm start.
-# Next, test the adam solver for adaptive learning rates. Note that we are using the
mini-batched dataset.
<pre class="example">
+DROP TABLE IF EXISTS mlp_model, mlp_model_summary, mlp_model_standardization;
SELECT madlib.mlp_classification(
'iris_data_packed', -- Output table from mini-batch preprocessor
'mlp_model', -- Destination table
@@ -1071,6 +1072,7 @@ the state (grouping column) associated with the model.
-# Prediction based on grouping using the state column:
<pre class="example">
+\\x off
DROP TABLE IF EXISTS mlp_prediction;
SELECT madlib.mlp_predict(
'mlp_model_group', -- Model table
@@ -1215,6 +1217,7 @@ SELECT madlib.mlp_predict(
</pre>
View results:
<pre class="example">
+\\x off
SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (id) ORDER BY id;
</pre>
<pre class="result">
@@ -1346,99 +1349,6 @@ JOIN mlp_regress_prediction USING (id);
(1 row)
</pre>
-<h4>Regression with Grouping and Mini-Batching</h4>
-
--# To use grouping and mini-batching, we must first
-re-run the preprocessor and specify grouping:
-<pre class="example">
-DROP TABLE IF EXISTS lin_housing_packed, lin_housing_packed_summary, lin_housing_packed_standardization;
-SELECT madlib.minibatch_preprocessor('lin_housing', -- Source table
- 'lin_housing_packed', -- Output table
- 'y', -- Dependent variable
- 'x', -- Independent variables
- 'zipcode' -- Group by zipcode
- );
-</pre>
-
--# Train regression model and group the training data by zipcode
-to learn a different model for each zipcode.
-<pre class="example">
-DROP TABLE IF EXISTS mlp_regress_group, mlp_regress_group_summary, mlp_regress_group_standardization;
--- Set seed so results are reproducible
-SELECT setseed(0);
-SELECT madlib.mlp_regression(
- 'lin_housing_packed', -- Source table
- 'mlp_regress_group', -- Destination table
- 'independent_varname', -- Input features
- 'dependent_varname', -- Dependent variable
- ARRAY[25,25], -- Number of units per layer
- 'learning_rate_init=0.001,
- n_iterations=500,
- lambda=0.001,
- tolerance=0', -- Optimizer params
- 'relu', -- Activation function
- NULL, -- Default weight (1)
- FALSE, -- No warm start
- FALSE, -- Not verbose
- 'zipcode' -- Grouping column
-);
-</pre>
-View regression model with grouping:
-<pre class="example">
-\\x on
-SELECT * FROM mlp_regress_group;
-</pre>
-<pre class="result">
--[ RECORD 1 ]--+------------------------------------------------------------------------------------
-zipcode | 200001
-coeff | {-0.193588485849,0.063428493184,-0.30440608833,-0.355695802004,-0.175942716164 ...
-loss | 0.0904009145541
-num_iterations | 500
--[ RECORD 2 ]--+------------------------------------------------------------------------------------
-zipcode | 94016
-coeff | {-0.18965351506,0.0633650963628,-0.302423579808,-0.334367637252,-0.230043593847 ...
-loss | 1.04772100552
-num_iterations | 500
-</pre>
-
--# Prediction using the regression model for each group based on the zipcode:
-<pre class="example">
-DROP TABLE IF EXISTS mlp_regress_prediction;
-SELECT madlib.mlp_predict(
- 'mlp_regress_group', -- Model table
- 'lin_housing', -- Test data table
- 'id', -- Id column in test table
- 'mlp_regress_prediction', -- Output table for predictions
- 'response' -- Output values, not probabilities
- );
-\\x off
-SELECT * FROM lin_housing JOIN mlp_regress_prediction USING (zipcode, id) ORDER BY zipcode, id;
-</pre>
-<pre class="result">
- zipcode | id | x | y | estimated_y
----------+----+----------------------------------------------------------------------------------+------+------------------
- 20001 | 12 | {1,0.11747,12.50,7.870,0,0.5240,6.0090,82.90,6.2267,5,311.0,15.20,396.90,13.27} | 18.9 | 19.2272848285357
- 20001 | 13 | {1,0.09378,12.50,7.870,0,0.5240,5.8890,39.00,5.4509,5,311.0,15.20,390.50,15.71} | 21.7 | 21.3979318641202
- 20001 | 14 | {1,0.62976,0.00,8.140,0,0.5380,5.9490,61.80,4.7075,4,307.0,21.00,396.90,8.26} | 20.4 | 19.7743403979155
- 20001 | 15 | {1,0.63796,0.00,8.140,0,0.5380,6.0960,84.50,4.4619,4,307.0,21.00,380.02,10.26} | 18.2 | 18.7400800902121
- 20001 | 16 | {1,0.62739,0.00,8.140,0,0.5380,5.8340,56.50,4.4986,4,307.0,21.00,395.62,8.47} | 19.9 | 19.6187933144569
- 20001 | 17 | {1,1.05393,0.00,8.140,0,0.5380,5.9350,29.30,4.4986,4,307.0,21.00,386.85,6.58} | 23.1 | 23.3492239648177
- 20001 | 18 | {1,0.78420,0.00,8.140,0,0.5380,5.9900,81.70,4.2579,4,307.0,21.00,386.75,14.67} | 17.5 | 17.0806608347814
- 20001 | 19 | {1,0.80271,0.00,8.140,0,0.5380,5.4560,36.60,3.7965,4,307.0,21.00,288.99,11.69} | 20.2 | 20.1559086626409
- 20001 | 20 | {1,0.72580,0.00,8.140,0,0.5380,5.7270,69.50,3.7965,4,307.0,21.00,390.95,11.28} | 18.2 | 18.6980897920022
- 94016 | 1 | {1,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98} | 24 | 23.9714991250013
- 94016 | 2 | {1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14} | 21.6 | 22.3655180133895
- 94016 | 3 | {1,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03} | 34.7 | 33.8620767428645
- 94016 | 4 | {1,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94} | 33.4 | 35.3094157686524
- 94016 | 5 | {1,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33} | 36.2 | 35.0379122731818
- 94016 | 6 | {1,0.02985,0.00,2.180,0,0.4580,6.4300,58.70,6.0622,3,222.0,18.70,394.12,5.21} | 28.7 | 27.5207943492151
- 94016 | 7 | {1,0.08829,12.50,7.870,0,0.5240,6.0120,66.60,5.5605,5,311.0,15.20,395.60,12.43} | 22.9 | 24.9841422781166
- 94016 | 8 | {1,0.14455,12.50,7.870,0,0.5240,6.1720,96.10,5.9505,5,311.0,15.20,396.90,19.15} | 27.1 | 24.5403994064793
- 94016 | 9 | {1,0.21124,12.50,7.870,0,0.5240,5.6310,100.00,6.0821,5,311.0,15.20,386.63,29.93} | 16.5 | 17.2588278443879
- 94016 | 10 | {1,0.17004,12.50,7.870,0,0.5240,6.0040,85.90,6.5921,5,311.0,15.20,386.71,17.10} | 18.9 | 17.0600407532569
- 94016 | 11 | {1,0.22489,12.50,7.870,0,0.5240,6.3770,94.30,6.3467,5,311.0,15.20,392.52,20.45} | 15 | 15.2284207930287
-(20 rows)
-</pre>
Note that the results you get for all examples may vary with the database you are using.
@anchor background
diff --git a/src/ports/postgres/modules/crf/crf.sql_in b/src/ports/postgres/modules/crf/crf.sql_in
index f9f6358a..9f12f48f 100644
--- a/src/ports/postgres/modules/crf/crf.sql_in
+++ b/src/ports/postgres/modules/crf/crf.sql_in
@@ -374,60 +374,39 @@ This example uses a trivial training and test data set.
-# Load the label table, the regular expressions table, and the training segment table:
<pre class="example">
-SELECT * FROM crf_label ORDER BY id;
+CREATE TABLE crf_label (id integer,label character varying);
+INSERT INTO crf_label VALUES
+(0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
+(9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
+(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
+(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
+(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
+CREATE TABLE crf_regex (pattern text,name text);
+INSERT INTO crf_regex VALUES
+('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'), ('^.*[0-9]+.*$','containsDigit'),
+('^.+[.]$','endsWithDot'), ('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
+('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'), ('^.+s$','endsWithS'),
+('^.+ing$','endsWithIng'), ('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
+('^.*@.*$','isEmailId');
+CREATE TABLE train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);
+INSERT INTO train_segmenttbl VALUES
+(0,1,'confidence',11,36), (1,1,'in',5,36), (2,1,'the',2,36), (3,1,'pound',11,36),
+(4,1,'is',31,36), (5,1,'widely',19,36), (6,1,'expected',29,36), (7,1,'to',24,36),
+(8,1,'take',26,36), (9,1,'another',2,36), (10,1,'sharp',6,36), (11,1,'dive',11,36),
+(12,1,'if',5,36), (13,1,'trade',11,36), (14,1,'figures',12,36), (15,1,'for',5,36),
+(16,1,'september',13,36), (17,1,',',42,36), (18,1,'due',6,36), (19,1,'for',5,36),
+(20,1,'release',11,36), (21,1,'tomorrow',11,36), (22,1,',',42,36), (23,1,'fail',26,36),
+(24,1,'to',24,36), (25,1,'show',26,36), (26,1,'a',2,36), (27,1,'substantial',6,36),
+(28,1,'improvement',11,36),(29,1,'from',5,36), (30,1,'july',13,36), (31,1,'and',0,36),
+(32,1,'august',13,36), (33,1,'''s',16,36), (34,1,'near-record',6,36),(35,1,'deficits',12,36),
+(36,1,'.',43,36), (0,2,'chancellor',13,26),(1,2,'of',5,26), (2,2,'the',2,26),
+(3,2,'exchequer',13,26), (4,2,'nigel',13,26), (5,2,'lawson',13,26), (6,2,'''s',16,26),
+(7,2,'restated',29,26), (8,2,'commitment',11,26),(9,2,'to',24,26), (10,2,'a',2,26),
+(11,2,'firm',11,26), (12,2,'monetary',6,26), (13,2,'policy',11,26), (14,2,'has',31,26),
+(15,2,'helped',29,26), (16,2,'to',24,26), (17,2,'prevent',26,26), (18,2,'a',2,26),
+(19,2,'freefall',11,26), (20,2,'in',5,26), (21,2,'sterling',11,26), (22,2,'over',5,26),
+(23,2,'the',2,26), (24,2,'past',6,26), (25,2,'week',11,26), (26,2,'.',43,26);
</pre>
-Result:
-<pre class="result">
- id | label
- ---+-------
- 0 | #
- 1 | $
- 2 | ''
-...
- 8 | CC
- 9 | CD
- 10 | DT
- 11 | EX
- 12 | FW
- 13 | IN
- 14 | JJ
-...
-</pre>
-The regular expressions table:
-<pre class="example">
-SELECT * from crf_regex;
-</pre>
-<pre class="result">
- pattern | name
- --------------+----------------------
- ^.+ing$ | endsWithIng
- ^[A-Z][a-z]+$ | InitCapital
- ^[A-Z]+$ | isAllCapital
- ^.*[0-9]+.*$ | containsDigit
-...
-</pre>
-The training segment table:
-<pre class="example">
-SELECT * from train_segmenttbl ORDER BY doc_id, start_pos;
-</pre>
-<pre class="result">
- doc_id | start_pos | seg_text | label
- -------+-----------+------------+-------
- 0 | 0 | Confidence | 18
- 0 | 1 | in | 13
- 0 | 2 | the | 10
- 0 | 3 | pound | 18
- 0 | 4 | is | 38
- 0 | 5 | widely | 26
-...
- 1 | 0 | Chancellor | 19
- 1 | 1 | of | 13
- 1 | 2 | the | 10
- 1 | 3 | Exchequer | 19
- 1 | 4 | Nigel | 19
-...
-</pre>
-
-# Generate the training features:
<pre class="example">
SELECT crf_train_fgen( 'train_segmenttbl',
@@ -441,18 +420,14 @@ SELECT * from crf_dictionary;
</pre>
Result:
<pre class="result">
- token | total
- ----------------+-------
- Hawthorne | 1
- Mercedes-Benzes | 1
- Wolf | 3
- best-known | 1
- hairline | 1
- accepting | 2
- purchases | 14
- trash | 5
- co-venture | 1
- restaurants | 7
+ token | total
+ ------------+-------
+ a | 3
+ and | 1
+ august | 1
+ chancellor | 1
+ dive | 1
+ exchequer | 1
...
</pre>
<pre class="example">
@@ -460,10 +435,10 @@ SELECT * from train_featuretbl;
</pre>
Result:
<pre class="result">
- doc_id | f_size | sparse_r | dense_m | sparse_m
- -------+--------+-------------------------------+---------------------------------+-----------------------
- 2 | 87 | {-1,13,12,0,1,-1,13,9,0,1,..} | {13,31,79,1,1,31,29,70,2,1,...} | {51,26,2,69,29,17,...}
- 1 | 87 | {-1,13,0,0,1,-1,13,9,0,1,...} | {13,0,62,1,1,0,13,54,2,1,13,..} | {51,26,2,69,29,17,...}
+ doc_id | f_size | sparse_r | dense_m | sparse_m
+ -------+--------+------------------------------+----------------------------------+-----------------------
+ 1 | 115 | {-1,11,82,0,1,-1,2,32,0,...} | {11,5,11,1,1,5,2,8,2,1,2,11,...} | {5,2,13,11,11,5,13,...}
+ 2 | 115 | {-1,19,35,0,0,-1,26,38,0,..} | {13,5,66,1,1,5,2,8,2,1,2,13,...} | {5,2,13,11,11,5,13,...}
</pre>
<pre class="example">
SELECT * from train_featureset;
@@ -471,23 +446,18 @@ SELECT * from train_featureset;
<pre class="result">
f_index | f_name | feature
--------+---------------+---------
- 1 | R_endsWithED | {-1,29}
- 13 | W_outweigh | {-1,26}
- 29 | U | {-1,5}
- 31 | U | {-1,29}
- 33 | U | {-1,12}
- 35 | W_a | {-1,2}
- 37 | W_possible | {-1,6}
- 15 | W_signaled | {-1,29}
- 17 | End. | {-1,43}
- 49 | W_'s | {-1,16}
- 63 | W_acquire | {-1,26}
- 51 | E. | {26,2}
- 69 | E. | {29,17}
- 71 | E. | {2,11}
- 83 | W_the | {-1,2}
- 85 | E. | {16,11}
- 4 | W_return | {-1,11}
+ 6 | W_the | {-1,2}
+ 9 | R_endsWithly | {-1,19}
+ 14 | W_figures | {-1,12}
+ 17 | W_helped | {-1,29}
+ 25 | W_show | {-1,26}
+ 28 | W_'s | {-1,16}
+ 33 | W_chancellor | {-1,13}
+ 43 | W_over | {-1,5}
+ 52 | W_trade | {-1,11}
+ 10 | W_july | {-1,13}
+ 21 | W_substantial | {-1,6}
+ 5 | E. | {2,13}
...
</pre>
@@ -513,48 +483,37 @@ SELECT * from crf_weights_tbl;
</pre>
Result:
<pre class="result">
- id | name | prev_label_id | label_id | weight
- ---+---------------+---------------+----------+-------------------
- 1 | R_endsWithED | -1 | 29 | 1.54128249293937
- 13 | W_outweigh | -1 | 26 | 1.70691232223653
- 29 | U | -1 | 5 | 1.40708515869008
- 31 | U | -1 | 29 | 0.830356200936407
- 33 | U | -1 | 12 | 0.769587378281239
- 35 | W_a | -1 | 2 | 2.68470625883726
- 37 | W_possible | -1 | 6 | 3.41773107604468
- 15 | W_signaled | -1 | 29 | 1.68187039165771
- 17 | End. | -1 | 43 | 3.07687845517082
- 49 | W_'s | -1 | 16 | 2.61430312229883
- 63 | W_acquire | -1 | 26 | 1.67247047385797
- 51 | E. | 26 | 2 | 3.0114240119435
- 69 | E. | 29 | 17 | 2.82385531733866
- 71 | E. | 2 | 11 | 3.00970493772732
- 83 | W_the | -1 | 2 | 2.58742315259326
+ id | name | prev_label_id | label_id | weight
+ ----+---------------+---------------+----------+-------------------
+ 4 | W_lawson | -1 | 13 | 1.73698153439171
+ 3 | End. | -1 | 43 | 3.3198742329636
+ 7 | W_has | -1 | 31 | 2.19831004450897
+ 24 | W_tomorrow | -1 | 11 | 3.34106414300743
+ 29 | W_. | -1 | 43 | 3.3198742329636
+ 34 | W_from | -1 | 5 | 2.80284597986744
+ 37 | W_august | -1 | 13 | 1.34455487966976
+ 39 | W_due | -1 | 6 | 3.39258895715363
+ 41 | W_exchequer | -1 | 13 | 1.82177698489335
...
</pre>
-# To find the best labels for a test set using the trained linear CRF model, repeat steps #1-2 to generate the test features, but instead of creating a new dictionary, use the dictionary generated from the training set.
<pre class="example">
-SELECT * from test_segmenttbl ORDER BY doc_id, start_pos;
-</pre>
-Result:
-<pre class="result">
- doc_id | start_pos | seg_text
- -------+-----------+---------------
- 0 | 0 | Rockwell
- 0 | 1 | International
- 0 | 2 | Corp.
- 0 | 3 | 's
- 0 | 4 | Tulsa
- 0 | 5 | unit
- 0 | 6 | said
-...
- 1 | 0 | Rockwell
- 1 | 1 | said
- 1 | 2 | the
- 1 | 3 | agreement
- 1 | 4 | calls
-...
+CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
+INSERT INTO test_segmenttbl VALUES
+(0,1,'chancellor',26),(1,1,'of',26), (2,1,'the',26), (3,1,'exchequer',26), (4,1,'nigel',26),
+(5,1,'lawson',26), (6,1,'''s',26), (7,1,'restated',26), (8,1,'commitment',26),(9,1,'to',26),
+(10,1,'a',26), (11,1,'firm',26), (12,1,'monetary',26),(13,1,'policy',26), (14,1,'has',26),
+(15,1,'helped',26), (16,1,'to',26), (17,1,'prevent',26), (18,1,'a',26), (19,1,'freefall',26),
+(20,1,'in',26), (21,1,'sterling',26),(22,1,'over',26), (23,1,'the',26), (24,1,'past',26),
+(25,1,'week',26), (26,1,'.',26), (0,2,'but',28), (1,2,'analysts',28), (2,2,'reckon',28),
+(3,2,'underlying',28),(4,2,'support',28), (5,2,'for',28), (6,2,'sterling',28), (7,2,'has',28),
+(8,2,'been',28), (9,2,'eroded',28), (10,2,'by',28), (11,2,'the',28), (12,2,'chancellor',28),
+(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
+(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
+(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
+(28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
+(4,3,'.',4);
</pre>
<pre class="example">
SELECT crf_test_fgen( 'test_segmenttbl',
@@ -566,7 +525,6 @@ SELECT crf_test_fgen( 'test_segmenttbl',
'viterbi_rtbl'
);
</pre>
-
-# Calculate the best label sequence and save in the table \c extracted_best_labels.
<pre class="example">
SELECT vcrf_label( 'test_segmenttbl',
@@ -584,20 +542,17 @@ Result:
<pre class="result">
doc_id | start_pos | seg_text | label | id | max_pos | prob
-------+-----------+---------------+-------+----+---------+----------
- 0 | 0 | Rockwell | NNP | 19 | 27 | 0.000269
- 0 | 1 | International | NNP | 19 | 27 | 0.000269
- 0 | 2 | Corp. | NNP | 19 | 27 | 0.000269
- 0 | 3 | 's | NNP | 19 | 27 | 0.000269
+ 1 | 4 | nigel | NNP | 13 | 26 | 0.387118
+ 1 | 5 | lawson | NNP | 13 | 26 | 0.387118
+ 1 | 7 | restated | VBN | 29 | 26 | 0.387118
+ 1 | 8 | commitment | NN | 11 | 26 | 0.387118
...
- 1 | 0 | Rockwell | NNP | 19 | 16 | 0.000168
- 1 | 1 | said | NNP | 19 | 16 | 0.000168
- 1 | 2 | the | DT | 10 | 16 | 0.000168
- 1 | 3 | agreement | JJ | 14 | 16 | 0.000168
- 1 | 4 | calls | NNS | 21 | 16 | 0.000168
+ 3 | 0 | his | NNP | 13 | 4 | 0.047757
+ 3 | 2 | prevent | JJ | 6 | 4 | 0.047757
+ 3 | 4 | . | . | 43 | 4 | 0.047757
...
</pre>
-
@anchor background
@par Technical Background
diff --git a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
index c1aaebf4..011b24e3 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net.sql_in
@@ -491,7 +491,7 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table
'fista', -- Optimizer
'', -- Optimizer parameters
NULL, -- Excluded columns
- 10000, -- Maximum iterations
+ 100, -- Maximum iterations
1e-6 -- Tolerance value
);
</pre>
@@ -507,12 +507,12 @@ Result:
family | gaussian
features | {tax,bath,size}
features_selected | {tax,bath,size}
-coef_nonzero | {22.785201806,10707.9664343,54.7959774173}
-coef_all | {22.785201806,10707.9664343,54.7959774173}
-intercept | -7798.71393905
-log_likelihood | -512248641.971
+coef_nonzero | {22.7898419099,10708.6395642,54.7864827154}
+coef_all | {22.7898419099,10708.6395642,54.7864827154}
+intercept | -7793.63839228
+log_likelihood | -512248647.34
standardize | t
-iteration_run | 10000
+iteration_run | 100
</pre>
-# Use the prediction function to evaluate residuals:
<pre class="example">
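-- A minimal sketch of the elided prediction query; assumes the model table from
-- the training step above is named houses_en and that
-- madlib.elastic_net_gaussian_predict(coefficients, intercept, ind_var) is available:
\\x off
SELECT id, price, predict, price - predict AS residual
FROM (
    SELECT houses.*,
           madlib.elastic_net_gaussian_predict(m.coef_all,
                                               m.intercept,
                                               ARRAY[tax, bath, size]) AS predict
    FROM houses, houses_en m
) s
ORDER BY id;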
@@ -533,33 +533,33 @@ Result:
<pre class="result">
id | price | predict | residual
----+--------+------------------+-------------------
- 1 | 50000 | 58545.391894031 | -8545.391894031
- 2 | 85000 | 114804.077663003 | -29804.077663003
- 3 | 22500 | 61448.835664388 | -38948.835664388
- 4 | 90000 | 104675.17768007 | -14675.17768007
- 5 | 133000 | 125887.70644358 | 7112.29355642
- 6 | 90500 | 78601.843595366 | 11898.156404634
- 7 | 260000 | 199257.358231079 | 60742.641768921
- 8 | 142500 | 82514.559377081 | 59985.440622919
- 9 | 160000 | 137735.93215082 | 22264.06784918
- 10 | 240000 | 250347.627648647 | -10347.627648647
- 11 | 87000 | 97172.428263539 | -10172.428263539
- 12 | 118600 | 119024.150628605 | -424.150628604999
- 13 | 140000 | 180692.127913358 | -40692.127913358
- 14 | 148000 | 156424.249824545 | -8424.249824545
- 15 | 65000 | 102527.938104575 | -37527.938104575
- 16 | 91000 | 102396.67273637 | -11396.67273637
- 17 | 132300 | 123609.20149988 | 8690.79850012
- 18 | 91100 | 74044.833707966 | 17055.166292034
- 19 | 260011 | 196978.853287379 | 63032.146712621
- 20 | 141800 | 84793.064320781 | 57006.935679219
- 21 | 160900 | 139330.88561141 | 21569.11438859
- 22 | 239000 | 248524.823693687 | -9524.82369368701
- 23 | 81010 | 95805.325297319 | -14795.325297319
- 24 | 117910 | 118340.599145495 | -430.599145494998
- 25 | 141100 | 182970.632857058 | -41870.632857058
- 26 | 148011 | 160981.259711945 | -12970.259711945
- 27 | 66000 | 104806.443048275 | -38806.443048275
+ 1 | 50000 | 58546.599589619 | -8546.599589619
+ 2 | 85000 | 114801.915370229 | -29801.915370229
+ 3 | 22500 | 61444.469688442 | -38944.469688442
+ 4 | 90000 | 104673.230727753 | -14673.230727753
+ 5 | 133000 | 125885.956130288 | 7114.04386971201
+ 6 | 90500 | 78606.203576913 | 11893.796423087
+ 7 | 260000 | 199256.827630643 | 60743.172369357
+ 8 | 142500 | 82512.27844767 | 59987.72155233
+ 9 | 160000 | 137736.673923436 | 22263.326076564
+ 10 | 240000 | 250344.545740518 | -10344.545740518
+ 11 | 87000 | 97176.215939216 | -10176.215939216
+ 12 | 118600 | 119026.288024408 | -426.288024408001
+ 13 | 140000 | 180696.360235914 | -40696.360235914
+ 14 | 148000 | 156426.301262683 | -8426.301262683
+ 15 | 65000 | 102523.118132785 | -37523.118132785
+ 16 | 91000 | 102394.246536763 | -11394.246536763
+ 17 | 132300 | 123606.971939298 | 8693.028060702
+ 18 | 91100 | 74048.235194933 | 17051.764805067
+ 19 | 260011 | 196977.843439653 | 63033.156560347
+ 20 | 141800 | 84791.26263866 | 57008.73736134
+ 21 | 160900 | 139331.962857129 | 21568.037142871
+ 22 | 239000 | 248521.358387726 | -9521.35838772598
+ 23 | 81010 | 95808.825424622 | -14798.825424622
+ 24 | 117910 | 118342.592767111 | -432.592767110997
+ 25 | 141100 | 182975.344426904 | -41875.344426904
+ 26 | 148011 | 160984.269644663 | -12973.269644663
+ 27 | 66000 | 104802.102323775 | -38802.102323775
</pre>
<h4>Example with Grouping</h4>
@@ -568,7 +568,7 @@ on zip code:
<pre class="example">
DROP TABLE IF EXISTS houses_en1, houses_en1_summary;
SELECT madlib.elastic_net_train( 'houses', -- Source table
- 'houses_en1', -- Result table
+ 'houses_en1', -- Result table
'price', -- Dependent variable
'array[tax, bath, size]', -- Independent variable
'gaussian', -- Regression family
@@ -579,7 +579,7 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table
'fista', -- Optimizer
'', -- Optimizer parameters
NULL, -- Excluded columns
- 10000, -- Maximum iterations
+ 100, -- Maximum iterations
1e-6 -- Tolerance value
);
</pre>
@@ -596,23 +596,23 @@ zipcode | 94301
family | gaussian
features | {tax,bath,size}
features_selected | {tax,bath,size}
-coef_nonzero | {27.0542096962,12351.5244083,47.5833289771}
-coef_all | {27.0542096962,12351.5244083,47.5833289771}
-intercept | -7191.19791597
-log_likelihood | -519199964.967
+coef_nonzero | {27.6936321338,11508.8932488,49.0964826846}
+coef_all | {27.6936321338,11508.8932488,49.0964826846}
+intercept | -11146.6219839
+log_likelihood | -520356660.297
standardize | t
-iteration_run | 10000
+iteration_run | 100
-[ RECORD 2 ]-----+--------------------------------------------
zipcode | 76010
family | gaussian
features | {tax,bath,size}
features_selected | {tax,bath,size}
-coef_nonzero | {15.6325953499,10166.6608469,57.8689916035}
-coef_all | {15.6325953499,10166.6608469,57.8689916035}
-intercept | 513.912201627
-log_likelihood | -538806528.45
+coef_nonzero | {14.9934758192,9134.7157512,62.796927836}
+coef_all | {14.9934758192,9134.7157512,62.796927836}
+intercept | 27.0655065032
+log_likelihood | -525632815.441
standardize | t
-iteration_run | 10000
+iteration_run | 100
</pre>
-# Use the prediction function to evaluate residuals:
<pre class="example">
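-- A minimal sketch of the elided prediction call that produces houses_en1_prediction;
-- assumes madlib.elastic_net_predict takes (model table, new source table, id column,
-- output table):
DROP TABLE IF EXISTS houses_en1_prediction;
SELECT madlib.elastic_net_predict( 'houses_en1',            -- Model table
                                   'houses',                -- New source data table
                                   'id',                    -- Unique ID column
                                   'houses_en1_prediction'  -- Output table for predictions
                                 );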
@@ -630,7 +630,38 @@ SELECT houses.id,
FROM houses_en1_prediction, houses
WHERE houses.id = houses_en1_prediction.id ORDER BY id;
</pre>
-
+Result:
+<pre class="result">
+ id | price | prediction | residual
+----+--------+------------------+-------------------
+ 1 | 50000 | 54505.805890984 | -4505.80589098399
+ 2 | 85000 | 110175.518839476 | -25175.518839476
+ 3 | 22500 | 52958.415553252 | -30458.415553252
+ 4 | 90000 | 99790.051960086 | -9790.05196008601
+ 5 | 133000 | 122071.482957216 | 10928.517042784
+ 6 | 90500 | 78007.790446902 | 12492.209553098
+ 7 | 260000 | 199466.3529096 | 60533.6470904
+ 8 | 142500 | 76636.825856866 | 65863.174143134
+ 9 | 160000 | 136472.171666792 | 23527.828333208
+ 10 | 240000 | 250762.917456118 | -10762.917456118
+ 11 | 87000 | 96903.077772146 | -9903.077772146
+ 12 | 118600 | 118105.451926206 | 494.548073793994
+ 13 | 140000 | 184131.233653376 | -44131.233653376
+ 14 | 148000 | 156805.424440596 | -8805.42444059599
+ 15 | 65000 | 95307.47866894 | -30307.47866894
+ 16 | 91000 | 111477.479576487 | -20477.4795764872
+ 17 | 132300 | 130783.929262327 | 1516.0707376728
+ 18 | 91100 | 77897.7592753032 | 13202.2407246968
+ 19 | 260011 | 196953.761128831 | 63057.2388711688
+ 20 | 141800 | 94329.0979647992 | 47470.9020352008
+ 21 | 160900 | 141129.427577575 | 19770.5724224248
+ 22 | 239000 | 247476.438620463 | -8476.43862046322
+ 23 | 81010 | 97832.1782395032 | -16822.1782395032
+ 24 | 117910 | 120632.283356431 | -2722.2833564312
+ 25 | 141100 | 176798.212621703 | -35698.2126217032
+ 26 | 148011 | 158801.641015487 | -10790.6410154872
+ 27 | 66000 | 116029.791359903 | -50029.7913599032
+</pre>
@anchor additional_example
<h4>Example where coef_nonzero is different from coef_all</h4>
-# Reuse the <a href="#examples">houses</a> table above and train the model with alpha=1 (L1)
@@ -649,7 +680,7 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table
'fista', -- Optimizer
'', -- Optimizer parameters
NULL, -- Excluded columns
- 10000, -- Maximum iterations
+ 1000, -- Maximum iterations
1e-6 -- Tolerance value
);
</pre>
@@ -665,12 +696,12 @@ Result:
family | gaussian
features | {tax,bath,size}
features_selected | {tax,size}
-coef_nonzero | {6.94744249834,29.7137297658}
-coef_all | {6.94744249834,0,29.7137297658}
-intercept | 74445.7039382
-log_likelihood | -1635348585.07
+coef_nonzero | {6.94502439324,29.7183899065}
+coef_all | {6.94502439324,0,29.7183899065}
+intercept | 74442.858956
+log_likelihood | -1635348583.9
standardize | t
-iteration_run | 151
+iteration_run | 297
</pre>
-# We can still use the prediction function with \e coef_all to evaluate residuals:
<pre class="example">
@@ -709,33 +740,33 @@ The two queries above will result in same residuals:
<pre class="result">
id | price | predict | residual
----+--------+------------------+-------------------
- 1 | 50000 | 101424.266931887 | -51424.2669318866
- 2 | 85000 | 123636.877531235 | -38636.877531235
- 3 | 22500 | 106081.206339915 | -83581.2063399148
- 4 | 90000 | 119117.827607296 | -29117.8276072958
- 5 | 133000 | 128186.922684709 | 4813.0773152912
- 6 | 90500 | 108190.009718915 | -17690.009718915
- 7 | 260000 | 157119.312909723 | 102880.687090277
- 8 | 142500 | 113935.028663057 | 28564.9713369428
- 9 | 160000 | 131799.592783846 | 28200.4072161544
- 10 | 240000 | 182913.598378673 | 57086.4016213268
- 11 | 87000 | 116583.600144218 | -29583.6001442184
- 12 | 118600 | 122842.722992761 | -4242.7229927608
- 13 | 140000 | 148278.940070862 | -8278.94007086201
- 14 | 148000 | 134883.191046754 | 13116.8089532462
- 15 | 65000 | 122046.449722531 | -57046.449722531
- 16 | 91000 | 118423.083357462 | -27423.0833574618
- 17 | 132300 | 127492.178434875 | 4807.8215651252
- 18 | 91100 | 106800.521219247 | -15700.521219247
- 19 | 260011 | 156424.568659889 | 103586.431340111
- 20 | 141800 | 114629.772912891 | 27170.2270871088
- 21 | 160900 | 132285.913758729 | 28614.0862412706
- 22 | 239000 | 182357.802978806 | 56642.197021194
- 23 | 81010 | 116166.753594318 | -35156.753594318
- 24 | 117910 | 122634.299717811 | -4724.29971781059
- 25 | 141100 | 148973.684320696 | -7873.68432069599
- 26 | 148011 | 136272.679546422 | 11738.3204535782
- 27 | 66000 | 122741.193972365 | -56741.193972365
+ 1 | 50000 | 101423.583576017 | -51423.5835760166
+ 2 | 85000 | 123638.064337067 | -38638.064337067
+ 3 | 22500 | 106083.252744755 | -83583.2527447548
+ 4 | 90000 | 119118.937056569 | -29118.9370565688
+ 5 | 133000 | 128187.876014827 | 4812.12398517321
+ 6 | 90500 | 108187.721610204 | -17687.721610204
+ 7 | 260000 | 157119.647513985 | 102880.352486015
+ 8 | 142500 | 113935.991734008 | 28564.0082659918
+ 9 | 160000 | 131799.288699312 | 28200.7113006884
+ 10 | 240000 | 182914.856562258 | 57085.1434377418
+ 11 | 87000 | 116581.541052473 | -29581.5410524734
+ 12 | 118600 | 122841.785856174 | -4241.7858561738
+ 13 | 140000 | 148276.800810484 | -8276.80081048398
+ 14 | 148000 | 134882.563805082 | 13117.4361949182
+ 15 | 65000 | 122048.790176031 | -57048.790176031
+ 16 | 91000 | 118424.434617245 | -27424.4346172448
+ 17 | 132300 | 127493.373575503 | 4806.6264244972
+ 18 | 91100 | 106798.716731556 | -15698.716731556
+ 19 | 260011 | 156425.145074661 | 103585.854925339
+ 20 | 141800 | 114630.494173332 | 27169.5058266678
+ 21 | 160900 | 132285.440406838 | 28614.5595931616
+ 22 | 239000 | 182359.254610799 | 56640.745389201
+ 23 | 81010 | 116164.839588879 | -35154.839588879
+ 24 | 117910 | 122633.435124377 | -4723.4351243766
+ 25 | 141100 | 148971.303249808 | -7871.303249808
+ 26 | 148011 | 136271.56868373 | 11739.4313162702
+ 27 | 66000 | 122743.292615355 | -56743.292615355
(27 rows)
</pre>
@@ -748,7 +779,7 @@ these combinations, then a 28th time for the whole dataset.)
<pre class="example">
DROP TABLE IF EXISTS houses_en3, houses_en3_summary, houses_en3_cv;
SELECT madlib.elastic_net_train( 'houses', -- Source table
- 'houses_en3', -- Result table
+ 'houses_en3', -- Result table
'price', -- Dependent variable
'array[tax, bath, size]', -- Independent variable
'gaussian', -- Regression family
@@ -763,9 +794,10 @@ SELECT madlib.elastic_net_train( 'houses', -- Source table
alpha = {0, 0.1, 1}
$$,
NULL, -- Excluded columns
- 10000, -- Maximum iterations
+ 200, -- Maximum iterations
1e-6 -- Tolerance value
);
+\\x on
SELECT * FROM houses_en3;
</pre>
<pre class="result">
@@ -773,30 +805,31 @@ SELECT * FROM houses_en3;
family | gaussian
features | {tax,bath,size}
features_selected | {tax,bath,size}
-coef_nonzero | {22.4584188479,11657.0739045,52.1624090811}
-coef_all | {22.4584188479,11657.0739045,52.1624090811}
-intercept | -5067.33396522
+coef_nonzero | {22.4584875792,11657.0840746,52.1622709646}
+coef_all | {22.4584875792,11657.0840746,52.1622709646}
+intercept | -5067.2628524
log_likelihood | -543193170.15
standardize | t
-iteration_run | 10000
+iteration_run | 200
</pre>
-# Details of the cross validation:
<pre class="example">
+\\x off
SELECT * FROM houses_en3_cv ORDER BY mean_neg_loss DESC;
</pre>
<pre class="result">
- alpha | lambda_value | mean_neg_loss | std_neg_loss
--------+--------------+------------------------------------------+
- 0.0 | 0.1 | -36094.4685768 | 10524.4473253
- 0.1 | 0.1 | -36136.2448004 | 10682.4136993
- 1.0 | 100.0 | -37007.9496501 | 12679.3781975
- 1.0 | 0.1 | -37018.1019927 | 12716.7438015
- 0.1 | 100.0 | -59275.6940173 | 9764.50064237
- 0.0 | 100.0 | -59380.252681 | 9763.26373034
- 1.0 | 100000.0 | -60353.0220769 | 9748.10305107
- 0.1 | 100000.0 | {large neg number} | {large pos number}
- 0.0 | 100000.0 | {large neg number} | {large pos number}
+ alpha | lambda_value | mean_score | std_dev_score
+-------+--------------+--------------------+--------------------
+ 0.0 | 0.1 | -36964.1349749 | 7006.24916542
+ 0.1 | 0.1 | -37033.6458432 | 7094.45992609
+ 1.0 | 100.0 | -38060.4537864 | 7891.42815774
+ 1.0 | 0.1 | -38097.4491274 | 7957.27212821
+ 0.1 | 100.0 | -58955.454086 | 12733.6947097
+ 0.0 | 100.0 | -59062.3214246 | 12731.3011318
+ 1.0 | 100000.0 | -60055.6624133 | 12708.5131797
+ 0.1 | 100000.0 | {large neg number} | {large pos number}
+ 0.0 | 100000.0 | {large neg number} | {large pos number}
(9 rows)
</pre>
@@ -880,7 +913,7 @@ File elastic_net.sql_in documenting the SQL functions.
* column names as 'col1, col2, ...' if col_ind_var is '*';
* element indices as '1,2,3, ...' if col_ind_var is a column of array)
* @param max_iter Maximum number of iterations to run the algorithm
- * (default value of 10000)
+ * (default value of 1000)
* @param tolerance Iteration stopping criteria. Default is 1e-6
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.elastic_net_train (
diff --git a/src/ports/postgres/modules/glm/glm.sql_in b/src/ports/postgres/modules/glm/glm.sql_in
index 377dfea5..3e4aa442 100644
--- a/src/ports/postgres/modules/glm/glm.sql_in
+++ b/src/ports/postgres/modules/glm/glm.sql_in
@@ -366,6 +366,7 @@ SELECT create_indicator_variables('warpbreaks', 'warpbreaks_dummy', 'wool,tensio
-# Train a GLM model.
<pre class="example">
+DROP TABLE IF EXISTS glm_model, glm_model_summary;
SELECT glm('warpbreaks_dummy',
'glm_model',
'breaks',
@@ -510,6 +511,7 @@ INSERT INTO abalone VALUES
-# Train a model with family=gaussian and link=identity
<pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_summary;
SELECT madlib.glm(
'abalone',
'abalone_out',
@@ -525,6 +527,7 @@ SELECT madlib.glm(
-# Train a model with family=gamma and link=inverse
<pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_summary;
SELECT madlib.glm(
'abalone',
'abalone_out',
@@ -540,6 +543,7 @@ SELECT madlib.glm(
-# Train a model with family=inverse_gaussian and link=sqr_inverse
<pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_summary;
SELECT madlib.glm(
'abalone',
'abalone_out',
@@ -555,6 +559,7 @@ SELECT madlib.glm(
-# Train a model with family=binomial and link=probit
<pre class="example">
+DROP TABLE IF EXISTS abalone_out, abalone_out_summary;
SELECT madlib.glm(
'abalone',
'abalone_out',
diff --git a/src/ports/postgres/modules/glm/multiresponseglm.sql_in b/src/ports/postgres/modules/glm/multiresponseglm.sql_in
index c810305c..5fa02251 100644
--- a/src/ports/postgres/modules/glm/multiresponseglm.sql_in
+++ b/src/ports/postgres/modules/glm/multiresponseglm.sql_in
@@ -369,7 +369,6 @@ SELECT madlib.multinom('test3',
\\x on
SELECT * FROM test3_output;
</pre>
-
Result:
<pre class="result">
-[ RECORD 1 ]------+------------------------------------------------------------
@@ -393,18 +392,16 @@ num_rows_processed | 57
num_rows_skipped | 0
iteration | 6
</pre>
-
-# Predict the dependent variable using the multinomial model.
(This example uses the original data table to perform the prediction. Typically
a different test dataset with the same features as the original training dataset
would be used for prediction.)
-
<pre class="example">
\\x off
-- Add the id column for prediction function
ALTER TABLE test3 ADD COLUMN id SERIAL;
-- Predict probabilities for all categories using the original data
-SELECT madlib.multinom_predict('test3_out','test3', 'test3_prd_prob', 'probability');
+SELECT madlib.multinom_predict('test3_output','test3', 'test3_prd_prob', 'probability');
-- Display the predicted value
SELECT * FROM test3_prd_prob;
</pre>
diff --git a/src/ports/postgres/modules/glm/ordinal.sql_in b/src/ports/postgres/modules/glm/ordinal.sql_in
index 7cda2941..7d1d7d5c 100644
--- a/src/ports/postgres/modules/glm/ordinal.sql_in
+++ b/src/ports/postgres/modules/glm/ordinal.sql_in
@@ -394,7 +394,6 @@ SELECT madlib.ordinal('test3',
\\x on
SELECT * FROM test3_output;
</pre>
-
Result:
<pre class="result">
-[ RECORD 1 ]------+-------------------------------------------
@@ -416,7 +415,6 @@ iteration | 7
(This example uses the original data table to perform the prediction. Typically
a different test dataset with the same features as the original training dataset
would be used for prediction.)
-
<pre class="example">
\\x off
-- Add the id column for prediction function
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 04f7b82a..de2e4d6d 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -1024,7 +1024,7 @@ SELECT madlib.tree_train('dt_golf', -- source table
1, -- min bucket
10 -- number of bins per continuous variable
);
-SELECT madlib.tree_display('train_output');
+SELECT madlib.tree_display('train_output', FALSE);
</pre>
<pre class="result">
-------------------------------------
@@ -1327,6 +1327,7 @@ View the output table (excluding the tree which is in binary format).
The input cp value was 0 (default) and the best 'pruning_cp' value
turns out to be 0 as well in this small example:
<pre class="example">
+\\x on
SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, tree_depth FROM train_output;
</pre>
<pre class="result">
@@ -1339,6 +1340,7 @@ tree_depth | 4
</pre>
The cp values tested and average error and standard deviation are:
<pre class="example">
+\\x off
SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC;
</pre>
<pre class="result">
diff --git a/src/ports/postgres/modules/regress/clustered_variance.sql_in b/src/ports/postgres/modules/regress/clustered_variance.sql_in
index f05630de..89143f06 100644
--- a/src/ports/postgres/modules/regress/clustered_variance.sql_in
+++ b/src/ports/postgres/modules/regress/clustered_variance.sql_in
@@ -284,27 +284,103 @@ clustered_variance_coxph(model_table, output_table, clustervar)
@anchor examples
@par Examples
--# View online help for the clustered variance linear regression function.
+-# Create a testing data table
<pre class="example">
-SELECT madlib.clustered_variance_linregr();
+CREATE TABLE abalone (
+ id integer,
+ sex text,
+ length double precision,
+ diameter double precision,
+ height double precision,
+ whole double precision,
+ shucked double precision,
+ viscera double precision,
+ shell double precision,
+ rings integer
+);
+INSERT INTO abalone VALUES
+(3151, 'F', 0.655000000000000027, 0.505000000000000004, 0.165000000000000008, 1.36699999999999999, 0.583500000000000019, 0.351499999999999979, 0.396000000000000019, 10),
+(2026, 'F', 0.550000000000000044, 0.469999999999999973, 0.149999999999999994, 0.920499999999999985, 0.381000000000000005, 0.243499999999999994, 0.267500000000000016, 10),
+(3751, 'I', 0.434999999999999998, 0.375, 0.110000000000000001, 0.41549999999999998, 0.170000000000000012, 0.0759999999999999981, 0.14499999999999999, 8),
+(720, 'I', 0.149999999999999994, 0.100000000000000006, 0.0250000000000000014, 0.0149999999999999994, 0.00449999999999999966, 0.00400000000000000008, 0.0050000000000000001, 2),
+(1635, 'F', 0.574999999999999956, 0.469999999999999973, 0.154999999999999999, 1.1160000000000001, 0.509000000000000008, 0.237999999999999989, 0.340000000000000024, 10),
+(2648, 'I', 0.5, 0.390000000000000013, 0.125, 0.582999999999999963, 0.293999999999999984, 0.132000000000000006, 0.160500000000000004, 8),
+(1796, 'F', 0.57999999999999996, 0.429999999999999993, 0.170000000000000012, 1.47999999999999998, 0.65349999999999997, 0.32400000000000001, 0.41549999999999998, 10),
+(209, 'F', 0.525000000000000022, 0.41499999999999998, 0.170000000000000012, 0.832500000000000018, 0.275500000000000023, 0.168500000000000011, 0.309999999999999998, 13),
+(1451, 'I', 0.455000000000000016, 0.33500000000000002, 0.135000000000000009, 0.501000000000000001, 0.274000000000000021, 0.0995000000000000051, 0.106499999999999997, 7),
+(1108, 'I', 0.510000000000000009, 0.380000000000000004, 0.115000000000000005, 0.515499999999999958, 0.214999999999999997, 0.113500000000000004, 0.166000000000000009, 8),
+(3675, 'F', 0.594999999999999973, 0.450000000000000011, 0.165000000000000008, 1.08099999999999996, 0.489999999999999991, 0.252500000000000002, 0.279000000000000026, 12),
+(2108, 'F', 0.675000000000000044, 0.550000000000000044, 0.179999999999999993, 1.68849999999999989, 0.562000000000000055, 0.370499999999999996, 0.599999999999999978, 15),
+(3312, 'F', 0.479999999999999982, 0.380000000000000004, 0.135000000000000009, 0.507000000000000006, 0.191500000000000004, 0.13650000000000001, 0.154999999999999999, 12),
+(882, 'M', 0.655000000000000027, 0.520000000000000018, 0.165000000000000008, 1.40949999999999998, 0.585999999999999965, 0.290999999999999981, 0.405000000000000027, 9),
+(3402, 'M', 0.479999999999999982, 0.395000000000000018, 0.149999999999999994, 0.681499999999999995, 0.214499999999999996, 0.140500000000000014, 0.2495, 18),
+(829, 'I', 0.409999999999999976, 0.325000000000000011, 0.100000000000000006, 0.394000000000000017, 0.20799999999999999, 0.0655000000000000027, 0.105999999999999997, 6),
+(1305, 'M', 0.535000000000000031, 0.434999999999999998, 0.149999999999999994, 0.716999999999999971, 0.347499999999999976, 0.14449999999999999, 0.194000000000000006, 9),
+(3613, 'M', 0.599999999999999978, 0.46000000000000002, 0.179999999999999993, 1.1399999999999999, 0.422999999999999987, 0.257500000000000007, 0.364999999999999991, 10),
+(1068, 'I', 0.340000000000000024, 0.265000000000000013, 0.0800000000000000017, 0.201500000000000012, 0.0899999999999999967, 0.0475000000000000006, 0.0550000000000000003, 5),
+(2446, 'M', 0.5, 0.380000000000000004, 0.135000000000000009, 0.583500000000000019, 0.22950000000000001, 0.126500000000000001, 0.179999999999999993, 12),
+(1393, 'M', 0.635000000000000009, 0.474999999999999978, 0.170000000000000012, 1.19350000000000001, 0.520499999999999963, 0.269500000000000017, 0.366499999999999992, 10),
+(359, 'M', 0.744999999999999996, 0.584999999999999964, 0.214999999999999997, 2.49900000000000011, 0.92649999999999999, 0.471999999999999975, 0.699999999999999956, 17),
+(549, 'F', 0.564999999999999947, 0.450000000000000011, 0.160000000000000003, 0.79500000000000004, 0.360499999999999987, 0.155499999999999999, 0.23000000000000001, 12),
+(1154, 'F', 0.599999999999999978, 0.474999999999999978, 0.160000000000000003, 1.02649999999999997, 0.484999999999999987, 0.2495, 0.256500000000000006, 9),
+(1790, 'F', 0.54500000000000004, 0.385000000000000009, 0.149999999999999994, 1.11850000000000005, 0.542499999999999982, 0.244499999999999995, 0.284499999999999975, 9),
+(3703, 'F', 0.665000000000000036, 0.540000000000000036, 0.195000000000000007, 1.76400000000000001, 0.850500000000000034, 0.361499999999999988, 0.469999999999999973, 11),
+(1962, 'F', 0.655000000000000027, 0.515000000000000013, 0.179999999999999993, 1.41199999999999992, 0.619500000000000051, 0.248499999999999999, 0.496999999999999997, 11),
+(1665, 'I', 0.604999999999999982, 0.469999999999999973, 0.14499999999999999, 0.802499999999999991, 0.379000000000000004, 0.226500000000000007, 0.220000000000000001, 9),
+(635, 'M', 0.359999999999999987, 0.294999999999999984, 0.100000000000000006, 0.210499999999999993, 0.0660000000000000031, 0.0524999999999999981, 0.0749999999999999972, 9),
+(3901, 'M', 0.445000000000000007, 0.344999999999999973, 0.140000000000000013, 0.475999999999999979, 0.205499999999999988, 0.101500000000000007, 0.108499999999999999, 15),
+(2734, 'I', 0.41499999999999998, 0.33500000000000002, 0.100000000000000006, 0.357999999999999985, 0.169000000000000011, 0.067000000000000004, 0.104999999999999996, 7),
+(3856, 'M', 0.409999999999999976, 0.33500000000000002, 0.115000000000000005, 0.440500000000000003, 0.190000000000000002, 0.0850000000000000061, 0.135000000000000009, 8),
+(827, 'I', 0.395000000000000018, 0.28999999999999998, 0.0950000000000000011, 0.303999999999999992, 0.127000000000000002, 0.0840000000000000052, 0.076999999999999999, 6),
+(3381, 'I', 0.190000000000000002, 0.130000000000000004, 0.0449999999999999983, 0.0264999999999999993, 0.00899999999999999932, 0.0050000000000000001, 0.00899999999999999932, 5),
+(3972, 'I', 0.400000000000000022, 0.294999999999999984, 0.0950000000000000011, 0.252000000000000002, 0.110500000000000001, 0.0575000000000000025, 0.0660000000000000031, 6),
+(1155, 'M', 0.599999999999999978, 0.455000000000000016, 0.170000000000000012, 1.1915, 0.695999999999999952, 0.239499999999999991, 0.239999999999999991, 8),
+(3467, 'M', 0.640000000000000013, 0.5, 0.170000000000000012, 1.4544999999999999, 0.642000000000000015, 0.357499999999999984, 0.353999999999999981, 9),
+(2433, 'F', 0.609999999999999987, 0.484999999999999987, 0.165000000000000008, 1.08699999999999997, 0.425499999999999989, 0.232000000000000012, 0.380000000000000004, 11),
+(552, 'I', 0.614999999999999991, 0.489999999999999991, 0.154999999999999999, 0.988500000000000045, 0.41449999999999998, 0.195000000000000007, 0.344999999999999973, 13),
+(1425, 'F', 0.729999999999999982, 0.57999999999999996, 0.190000000000000002, 1.73750000000000004, 0.678499999999999992, 0.434499999999999997, 0.520000000000000018, 11),
+(2402, 'F', 0.584999999999999964, 0.41499999999999998, 0.154999999999999999, 0.69850000000000001, 0.299999999999999989, 0.145999999999999991, 0.195000000000000007, 12),
+(1748, 'F', 0.699999999999999956, 0.535000000000000031, 0.174999999999999989, 1.77299999999999991, 0.680499999999999994, 0.479999999999999982, 0.512000000000000011, 15),
+(3983, 'I', 0.57999999999999996, 0.434999999999999998, 0.149999999999999994, 0.891499999999999959, 0.362999999999999989, 0.192500000000000004, 0.251500000000000001, 6),
+(335, 'F', 0.739999999999999991, 0.599999999999999978, 0.195000000000000007, 1.97399999999999998, 0.597999999999999976, 0.408499999999999974, 0.709999999999999964, 16),
+(1587, 'I', 0.515000000000000013, 0.349999999999999978, 0.104999999999999996, 0.474499999999999977, 0.212999999999999995, 0.122999999999999998, 0.127500000000000002, 10),
+(2448, 'I', 0.275000000000000022, 0.204999999999999988, 0.0800000000000000017, 0.096000000000000002, 0.0359999999999999973, 0.0184999999999999991, 0.0299999999999999989, 6),
+(1362, 'F', 0.604999999999999982, 0.474999999999999978, 0.174999999999999989, 1.07600000000000007, 0.463000000000000023, 0.219500000000000001, 0.33500000000000002, 9),
+(2799, 'M', 0.640000000000000013, 0.484999999999999987, 0.149999999999999994, 1.09800000000000009, 0.519499999999999962, 0.222000000000000003, 0.317500000000000004, 10),
+(1413, 'F', 0.67000000000000004, 0.505000000000000004, 0.174999999999999989, 1.01449999999999996, 0.4375, 0.271000000000000019, 0.3745, 10),
+(1739, 'F', 0.67000000000000004, 0.540000000000000036, 0.195000000000000007, 1.61899999999999999, 0.739999999999999991, 0.330500000000000016, 0.465000000000000024, 11),
+(1152, 'M', 0.584999999999999964, 0.465000000000000024, 0.160000000000000003, 0.955500000000000016, 0.45950000000000002, 0.235999999999999988, 0.265000000000000013, 7),
+(2427, 'I', 0.564999999999999947, 0.434999999999999998, 0.154999999999999999, 0.782000000000000028, 0.271500000000000019, 0.16800000000000001, 0.284999999999999976, 14),
+(1777, 'M', 0.484999999999999987, 0.369999999999999996, 0.154999999999999999, 0.967999999999999972, 0.418999999999999984, 0.245499999999999996, 0.236499999999999988, 9),
+(3294, 'M', 0.574999999999999956, 0.455000000000000016, 0.184999999999999998, 1.15599999999999992, 0.552499999999999991, 0.242999999999999994, 0.294999999999999984, 13),
+(1403, 'M', 0.650000000000000022, 0.510000000000000009, 0.190000000000000002, 1.54200000000000004, 0.715500000000000025, 0.373499999999999999, 0.375, 9),
+(2256, 'M', 0.510000000000000009, 0.395000000000000018, 0.14499999999999999, 0.61850000000000005, 0.215999999999999998, 0.138500000000000012, 0.239999999999999991, 12),
+(3984, 'F', 0.584999999999999964, 0.450000000000000011, 0.125, 0.873999999999999999, 0.354499999999999982, 0.20749999999999999, 0.225000000000000006, 6),
+(1116, 'M', 0.525000000000000022, 0.405000000000000027, 0.119999999999999996, 0.755499999999999949, 0.3755, 0.155499999999999999, 0.201000000000000012, 9),
+(1366, 'M', 0.609999999999999987, 0.474999999999999978, 0.170000000000000012, 1.02649999999999997, 0.434999999999999998, 0.233500000000000013, 0.303499999999999992, 10),
+(3759, 'I', 0.525000000000000022, 0.400000000000000022, 0.140000000000000013, 0.605500000000000038, 0.260500000000000009, 0.107999999999999999, 0.209999999999999992, 9);
</pre>
-# Run the linear regression function and view the results.
<pre class="example">
+\\x on
DROP TABLE IF EXISTS out_table, out_table_summary;
SELECT madlib.clustered_variance_linregr( 'abalone',
'out_table',
'rings',
- 'ARRAY[1, diameter, length, width]',
+ 'ARRAY[1, diameter, length, height]',
'sex',
NULL
);
SELECT * FROM out_table;
</pre>
-
--# View online help for the clustered variance logistic regression function.
-<pre class="example">
-SELECT madlib.clustered_variance_logregr();
+Result:
+<pre class="result">
+-[ RECORD 1 ]-----------------------------------------------------------------------
+coef | {2.53526184512177,14.1959262629025,-17.4142205261305,73.9536825412142}
+std_err | {2.08204036310278,10.1218601277935,16.350795118006,17.7971852600971}
+t_stats | {1.21768141004893,1.40250172237829,-1.06503814649071,4.15535835922465}
+p_values | {0.22845116414893,0.166285056923658,0.2914293364465,0.000112184340238519}
</pre>
-# Run the logistic regression function and view the results.
@@ -313,15 +389,18 @@ DROP TABLE IF EXISTS out_table, out_table_summary;
SELECT madlib.clustered_variance_logregr( 'abalone',
'out_table',
'rings < 10',
- 'ARRAY[1, diameter, length, width]',
+ 'ARRAY[1, diameter, length, height]',
'sex'
);
SELECT * FROM out_table;
</pre>
-
--# View online help for the clustered variance multinomial logistic regression function.
-<pre class="example">
-SELECT madlib.clustered_variance_mlogregr();
+Result:
+<pre class="result">
+-[ RECORD 1 ]----------------------------------------------------------------------------
+coef | {7.03525620439852,5.16355730320515,-4.03125518391448,-47.5439002903374}
+std_err | {2.69860857119167,21.4303882155136,16.6528594816461,5.89094595954187}
+z_stats | {2.60699394476904,0.240945579299736,-0.242075854201348,-8.0706733038907}
+p_values | {0.00913409755638422,0.809597295390548,0.808721387408619,6.99115526001629e-16}
</pre>
-# Run the multinomial logistic regression and view the results.
@@ -330,31 +409,84 @@ DROP TABLE IF EXISTS out_table, out_table_summary;
SELECT madlib.clustered_variance_mlogregr( 'abalone',
'out_table',
'CASE WHEN rings < 10 THEN 1 ELSE 0 END',
- 'ARRAY[1, diameter, length, width]',
+ 'ARRAY[1, diameter, length, height]',
'sex',
0
);
SELECT * FROM out_table;
</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]+-------------------------------------------------------------------------------
+category | 1
+ref_category | 0
+coef | {7.03525620439846,5.16355730320138,-4.03125518391122,-47.5439002903385}
+std_err | {2.69860857119169,21.4303882155156,16.6528594816446,5.89094595954797}
+z_stats | {2.606993944769,0.240945579299537,-0.242075854201173,-8.07067330388254}
+p_values | {0.00913409755638535,0.809597295390702,0.808721387408754,6.99115526048361e-16}
+</pre>
-# Run the Cox Proportional Hazards regression and compute the clustered robust estimator.
<pre class="example">
-DROP TABLE IF EXISTS lung_cl_out;
-DROP TABLE IF EXISTS lung_out;
-DROP TABLE IF EXISTS lung_out_summary;
-SELECT madlib.coxph_train('lung',
- 'lung_out',
- 'time',
- 'array[age, "ph.ecog"]',
- 'TRUE',
- NULL,
- NULL);
-SELECT madlib.clustered_variance_coxph('lung_out',
- 'lung_cl_out',
- '"ph.karno"');
-SELECT * FROM lung_cl_out;
+DROP TABLE IF EXISTS sample_data;
+CREATE TABLE sample_data (
+ id INTEGER NOT NULL,
+ grp DOUBLE PRECISION,
+ wbc DOUBLE PRECISION,
+ timedeath INTEGER,
+ status BOOLEAN,
+ sex TEXT
+);
+COPY sample_data FROM STDIN WITH DELIMITER '|';
+ 0 | 0 | 1.45 | 35 | t | 'M'
+ 1 | 0 | 1.47 | 34 | t | 'M'
+ 3 | 0 | 2.2 | 32 | t | 'M'
+ 4 | 0 | 1.78 | 25 | t | 'M'
+ 5 | 0 | 2.57 | 23 | t | 'M'
+ 6 | 0 | 2.32 | 22 | t | 'M'
+ 7 | 0 | 2.01 | 20 | t | 'M'
+ 8 | 0 | 2.05 | 19 | t | 'M'
+ 9 | 0 | 2.16 | 17 | t | 'M'
+ 10 | 0 | 3.6 | 16 | t | 'M'
+ 11 | 1 | 2.3 | 15 | t | 'M'
+ 12 | 0 | 2.88 | 13 | t | 'I'
+ 13 | 1 | 1.5 | 12 | t | 'I'
+ 14 | 0 | 2.6 | 11 | t | 'I'
+ 15 | 0 | 2.7 | 10 | t | 'I'
+ 16 | 0 | 2.8 | 9 | t | 'I'
+ 17 | 1 | 2.32 | 8 | t | 'F'
+ 18 | 0 | 4.43 | 7 | t | 'F'
+ 19 | 0 | 2.31 | 6 | t | 'F'
+ 20 | 1 | 3.49 | 5 | t | 'F'
+ 21 | 1 | 2.42 | 4 | t | 'F'
+ 22 | 1 | 4.01 | 3 | t | 'F'
+ 23 | 1 | 4.91 | 2 | t | 'F'
+ 24 | 1 | 5 | 1 | t | 'F'
+\\.
+DROP TABLE IF EXISTS sample_cox, sample_cox_summary, sample_cox_cl;
+SELECT madlib.coxph_train( 'sample_data',
+ 'sample_cox',
+ 'timedeath',
+ 'ARRAY[grp,wbc]',
+ 'status'
+ );
+SELECT madlib.clustered_variance_coxph('sample_cox',
+ 'sample_cox_cl',
+ 'sex');
+SELECT * FROM sample_cox_cl;
+</pre>
+Result:
+<pre class="result">
+-[ RECORD 1 ]-+----------------------------------------------------------------------------
+coef | {2.54407073265254,1.67172094779487}
+loglikelihood | -37.8532498733
+std_err | {0.677180599295183,0.387195514577697}
+clustervar | sex
+clustered_se | {0.545274710867954,0.228046806400425}
+clustered_z | {4.6656679320465,7.33060451133666}
+clustered_p | {3.07616143241047e-06,2.29116873819977e-13}
+hessian | {{2.78043065745617,-2.25848560642761},{-2.25848560642761,8.50472838284472}}
</pre>
-
@anchor notes
@par Notes
diff --git a/src/ports/postgres/modules/regress/marginal.sql_in b/src/ports/postgres/modules/regress/marginal.sql_in
index 3cb3f8a2..b7188175 100644
--- a/src/ports/postgres/modules/regress/marginal.sql_in
+++ b/src/ports/postgres/modules/regress/marginal.sql_in
@@ -276,39 +276,34 @@ margins_mlogregr( source_table,
@anchor examples
@examp
--# View online help for the marginal effects function.
-<pre class="example">
-SELECT madlib.margins();
-</pre>
-
-# Create the sample data set. Use the <tt>patients</tt> dataset from the <a href="group__grp__logreg.html#examples">Logistic Regression examples</a>.
<pre class="example">
-SELECT * FROM patients;
-</pre>
-Result:
-<pre class="result">
- id | second_attack | treatment | trait_anxiety
- ---+---------------+-----------+---------------
- 1 | 1 | 1 | 70
- 3 | 1 | 1 | 50
- 5 | 1 | 0 | 40
- 7 | 1 | 0 | 75
- 9 | 1 | 0 | 70
- 11 | 0 | 1 | 65
- 13 | 0 | 1 | 45
- 15 | 0 | 1 | 40
- 17 | 0 | 0 | 55
- 19 | 0 | 0 | 50
- 2 | 1 | 1 | 80
- 4 | 1 | 0 | 60
- 6 | 1 | 0 | 65
- 8 | 1 | 0 | 80
- 10 | 1 | 0 | 60
- 12 | 0 | 1 | 50
- 14 | 0 | 1 | 35
- 16 | 0 | 1 | 50
- 18 | 0 | 0 | 45
- 20 | 0 | 0 | 60
+DROP TABLE IF EXISTS patients;
+CREATE TABLE patients( id INTEGER NOT NULL,
+ second_attack INTEGER,
+ treatment INTEGER,
+ trait_anxiety INTEGER);
+INSERT INTO patients VALUES
+(1, 1, 1, 70),
+(2, 1, 1, 80),
+(3, 1, 1, 50),
+(4, 1, 0, 60),
+(5, 1, 0, 40),
+(6, 1, 0, 65),
+(7, 1, 0, 75),
+(8, 1, 0, 80),
+(9, 1, 0, 70),
+(10, 1, 0, 60),
+(11, 0, 1, 65),
+(12, 0, 1, 50),
+(13, 0, 1, 45),
+(14, 0, 1, 35),
+(15, 0, 1, 40),
+(16, 0, 1, 50),
+(17, 0, 0, 55),
+(18, 0, 0, 45),
+(19, 0, 0, 50),
+(20, 0, 0, 60);
</pre>
-# Run logistic regression to get the model, compute the marginal effects of all variables, and view the results.
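The marginal-effects call for this step falls outside the hunks shown here; a minimal
sketch, assuming madlib.margins_logregr takes the source table, an output table, the
dependent variable, and the independent variable array (the output table name is
illustrative):
<pre class="example">
DROP TABLE IF EXISTS margins_out;
SELECT madlib.margins_logregr( 'patients',                           -- Source table
                               'margins_out',                        -- Output table (illustrative name)
                               'second_attack',                      -- Dependent variable
                               'ARRAY[1, treatment, trait_anxiety]'  -- Independent variables
                             );
\\x on
SELECT * FROM margins_out;
</pre>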
@@ -365,28 +360,70 @@ p_values | {0.629544793219806}
Use the dataset from the <a href="group__grp__mlogreg.html#examples">Multinomial Regression example</a>.
<pre class="example">
\\x OFF
-SELECT * FROM test3;
-</pre>
-Result:
-<pre class="result">
- feat1 | feat2 | cat
--------+-------+-----
- 2 | 33 | 0
- 2 | 31 | 1
- 2 | 36 | 1
- 2 | 31 | 1
- 2 | 41 | 1
- 2 | 37 | 1
- 2 | 44 | 1
- 2 | 46 | 1
- 2 | 46 | 2
- 2 | 39 | 0
- 2 | 44 | 1
- 2 | 44 | 0
- 2 | 67 | 2
- 2 | 59 | 2
- 2 | 59 | 0
-...
+DROP TABLE IF EXISTS test3;
+CREATE TABLE test3 (
+ feat1 INTEGER,
+ feat2 INTEGER,
+ cat INTEGER
+);
+INSERT INTO test3(feat1, feat2, cat) VALUES
+(1,35,1),
+(2,33,0),
+(3,39,1),
+(1,37,1),
+(2,31,1),
+(3,36,0),
+(2,36,1),
+(2,31,1),
+(2,41,1),
+(2,37,1),
+(1,44,1),
+(3,33,2),
+(1,31,1),
+(2,44,1),
+(1,35,1),
+(1,44,0),
+(1,46,0),
+(2,46,1),
+(2,46,2),
+(3,49,1),
+(2,39,0),
+(2,44,1),
+(1,47,1),
+(1,44,1),
+(1,37,2),
+(3,38,2),
+(1,49,0),
+(2,44,0),
+(3,61,2),
+(1,65,2),
+(3,67,1),
+(3,65,2),
+(1,65,2),
+(2,67,2),
+(1,65,2),
+(1,62,2),
+(3,52,2),
+(3,63,2),
+(2,59,2),
+(3,65,2),
+(2,59,0),
+(3,67,2),
+(3,67,2),
+(3,60,2),
+(3,67,2),
+(3,62,2),
+(2,54,2),
+(3,65,2),
+(3,62,2),
+(2,59,2),
+(3,60,2),
+(3,63,2),
+(3,65,2),
+(2,63,1),
+(2,67,2),
+(2,65,2),
+(2,62,2);
</pre>
-# Run the regression function and then compute the marginal effects of all variables in the regression.
diff --git a/src/ports/postgres/modules/regress/robust.sql_in b/src/ports/postgres/modules/regress/robust.sql_in
index 7964eb00..ecafd915 100644
--- a/src/ports/postgres/modules/regress/robust.sql_in
+++ b/src/ports/postgres/modules/regress/robust.sql_in
@@ -325,7 +325,7 @@ COPY patients FROM STDIN WITH DELIMITER '|';
</pre>
-# Run the logistic regression training function and compute the robust logistic variance of the regression:
<pre class="example">
-DROP TABLE IF EXISTS patients_logregr;
+DROP TABLE IF EXISTS patients_logregr, patients_logregr_summary;
SELECT madlib.robust_variance_logregr( 'patients',
'patients_logregr',
'second_attack',
@@ -335,7 +335,6 @@ SELECT madlib.robust_variance_logregr( 'patients',
-# View the regression results.
<pre class="example">
\\x on
-Expanded display is on.
SELECT * FROM patients_logregr;
</pre>
Result:
@@ -401,6 +400,7 @@ COPY sample_data FROM STDIN DELIMITER '|';
</pre>
-# Run the Cox regression function.
<pre class="example">
+DROP TABLE IF EXISTS sample_cox, sample_cox_summary;
SELECT madlib.coxph_train( 'sample_data',
'sample_cox',
'timedeath',
diff --git a/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in b/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
index e8faa01a..df3acff4 100644
--- a/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
+++ b/src/ports/postgres/modules/stats/cox_prop_hazards.sql_in
@@ -297,11 +297,6 @@ coxph_predict(model_table,
@anchor examples
@examp
--# View online help for the proportional hazards training method.
-<pre class="example">
-SELECT madlib.coxph_train();
-</pre>
-
-# Create an input data set.
<pre class="example">
DROP TABLE IF EXISTS sample_data;
@@ -341,6 +336,7 @@ COPY sample_data FROM STDIN WITH DELIMITER '|';
</pre>
-# Run the Cox regression function.
<pre class="example">
+DROP TABLE IF EXISTS sample_cox, sample_cox_summary;
SELECT madlib.coxph_train( 'sample_data',
'sample_cox',
'timedeath',
@@ -372,13 +368,14 @@ would be used.)
<pre class="example">
\\x off
-- Display the linear predictors for the original dataset
+DROP TABLE IF EXISTS sample_pred;
SELECT madlib.coxph_predict('sample_cox',
'sample_data',
'id',
'sample_pred');
+SELECT * FROM sample_pred;
</pre>
<pre class="result">
-SELECT * FROM sample_pred;
id | predicted_value
----+--------------------
0 | -2.97110918125034
@@ -409,11 +406,13 @@ SELECT * FROM sample_pred;
</pre>
<pre class="example">
-- Display the relative risk for the original dataset
+DROP TABLE IF EXISTS sample_pred;
SELECT madlib.coxph_predict('sample_cox',
'sample_data',
'id',
'sample_pred',
'risk');
+SELECT * FROM sample_pred;
</pre>
<pre class="result">
id | predicted_value
@@ -454,6 +453,7 @@ SELECT madlib.cox_zph( 'sample_cox',
</pre>
-# View results of the PHA test.
<pre class="example">
+\\x on
SELECT * FROM sample_zph_output;
</pre>
Results:
diff --git a/src/ports/postgres/modules/svm/svm.sql_in b/src/ports/postgres/modules/svm/svm.sql_in
index 8049fcf9..de426fdd 100644
--- a/src/ports/postgres/modules/svm/svm.sql_in
+++ b/src/ports/postgres/modules/svm/svm.sql_in
@@ -914,7 +914,7 @@ SELECT madlib.svm_regression( 'abalone',
'gaussian',
'n_components=10',
'',
- 'init_stepsize=[0.01,1], n_folds=3, max_iter=200, lambda=[0.01, 0.1, 0.5],
+ 'init_stepsize=[0.01,1], n_folds=3, max_iter=100, lambda=[0.01, 0.1, 0.5],
validation_result=abalone_svm_gaussian_regression_cv'
);
\\x on
@@ -922,10 +922,10 @@ SELECT * FROM abalone_svm_gaussian_regression;
</pre>
<pre class="result">
-[ RECORD 1 ]------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-coef | {4.46074154389204,2.19335800415975,-2.14775901092668,1.06805891149535,2.91168496475457,-3.95521278459095,4.20496790233169,-5.28144330907061,0.427743633754918,-3.58999505728692}
-loss | 2.68317592175908
-norm_of_gradient | 0.69852112502746
-num_iterations | 169
+coef | {4.78843909576373,1.82443514279869,-2.36814051329038,1.48570658714655,1.77828407859486,-6.38446590203159,3.99631488328669,-6.52441625480202,3.82342434407222,-3.67579802967837}
+loss | 2.5002301320973
+norm_of_gradient | 0.692031063542266
+num_iterations | 100
num_rows_processed | 20
num_rows_skipped | 0
dep_var_mapping | {NULL}
@@ -948,7 +948,7 @@ kernel_params | gamma=0.25, n_components=10,random_state=1, fit_intercept
grouping_col | NULL
optim_params | init_stepsize=1.0,
| decay_factor=0.9,
- | max_iter=200,
+ | max_iter=100,
| tolerance=1e-10,
| epsilon=0.01,
| eps_table=,
@@ -958,6 +958,7 @@ num_all_groups | 1
num_failed_groups | 0
total_rows_processed | 20
total_rows_skipped | 0
+
(6 rows)
</pre>
View the statistics for the various cross validation values:
@@ -967,13 +968,13 @@ SELECT * FROM abalone_svm_gaussian_regression_cv;
</pre>
<pre class="result">
init_stepsize | lambda | mean_score | std_dev_score
----------------+--------+----------------+----------------
- 1.0 | 0.01 | -4.06711568585 | 0.435966381366
- 1.0 | 0.1 | -4.08068428345 | 0.44660797513
- 1.0 | 0.5 | -4.52576046087 | 0.20597876382
- 0.01 | 0.01 | -11.0231044189 | 0.739956548721
- 0.01 | 0.1 | -11.0244799274 | 0.740029346709
- 0.01 | 0.5 | -11.0305445077 | 0.740350338532
+---------------+--------+----------------+---------------
+ 0.01 | 0.5 | -10.3718064203 | 1.70034854924
+ 1.0 | 0.5 | -4.03598164681 | 2.03645607616
+ 0.01 | 0.01 | -10.3424727426 | 1.70103511744
+ 0.01 | 0.1 | -10.3479312204 | 1.70090715482
+ 1.0 | 0.01 | -3.59999345219 | 1.80351978881
+ 1.0 | 0.1 | -3.74710586166 | 1.91455355654
(6 rows)
</pre>
-# Predict using the cross-validated Gaussian regression model:
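The prediction step itself is unchanged by this hunk; only the resulting RMS error below is updated. A minimal sketch, assuming the <tt>svm_predict</tt> argument order (model table, data table, id column, output table) and that the dependent column of <tt>abalone</tt> is <tt>rings</tt> with the prediction stored in a <tt>prediction</tt> column, might look like:
<pre class="example">
-- Sketch only: predict with the cross-validated model, then recompute the RMS error.
DROP TABLE IF EXISTS abalone_gaussian_regr;
SELECT madlib.svm_predict( 'abalone_svm_gaussian_regression',  -- model table
                           'abalone',                          -- new data table
                           'id',                               -- id column
                           'abalone_gaussian_regr'             -- output table
                         );
SELECT SQRT(AVG((rings - prediction)^2)) AS rms_error
FROM abalone
JOIN abalone_gaussian_regr USING (id);
</pre>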
@@ -993,7 +994,7 @@ JOIN abalone_gaussian_regr USING (id);
<pre class="result">
rms_error
------------------
- 3.84208909699442
+ 3.64653492872019
(1 row)
</pre>
@@ -1003,7 +1004,7 @@ JOIN abalone_gaussian_regr USING (id);
Note that the dependent variable is not a parameter for one-class:
<pre class="example">
DROP TABLE IF EXISTS houses_one_class_gaussian, houses_one_class_gaussian_summary, houses_one_class_gaussian_random;
-select madlib.svm_one_class('houses',
+SELECT madlib.svm_one_class('houses',
'houses_one_class_gaussian',
'ARRAY[1,tax,bedroom,bath,size,lot,price]',
'gaussian',
diff --git a/src/ports/postgres/modules/validation/cross_validation.sql_in b/src/ports/postgres/modules/validation/cross_validation.sql_in
index 77b2c2b4..785c113f 100644
--- a/src/ports/postgres/modules/validation/cross_validation.sql_in
+++ b/src/ports/postgres/modules/validation/cross_validation.sql_in
@@ -264,7 +264,7 @@ SELECT madlib.cross_validation_general(
-- modelling_params
'{%%data%, %%model%, price, "array[tax, bath, size]", gaussian, 0.5, lambda, TRUE, NULL, fista,
"{eta = 2, max_stepsize = 2, use_active_set = t}",
- NULL, 10000, 1e-6}'::varchar[],
+ NULL, 200, 1e-6}'::varchar[],
-- modelling_params_type
'{varchar, varchar, varchar, varchar, varchar, double precision,
double precision, boolean, varchar, varchar, varchar, varchar,
@@ -304,8 +304,8 @@ Results from the lambda values explored:
<pre class="result">
lambda | mean_squared_error_avg | mean_squared_error_stddev
--------+------------------------+---------------------------
- 0.1 | 1194685622.1604 | 366687470.779826
- 0.2 | 1181768409.98238 | 352203200.758414
+ 0.1 | 1094965503.24269 | 411974996.039577
+ 0.2 | 1093350170.40664 | 411072137.632718
(2 rows)
</pre>
@@ -355,10 +355,10 @@ Results from the explored number of iterations:
<pre class="result">
max_iter | error_rate_avg | error_rate_stddev
----------+------------------------+--------------------------------------------
- 2 | 0.28214285714285714286 | 0.2053183193114972855562362870460638951565
- 10 | 0.25357142857142857143 | 0.1239753925550698688724699258837060065614
- 40 | 0.25357142857142857143 | 0.1239753925550698688724699258837060065614
- 100 | 0.25357142857142857143 | 0.1239753925550698688724699258837060065614
+ 2 | 0.19285714285714285714 | 0.1589185390091927774733662830554976076700
+ 10 | 0.22142857142857142857 | 0.1247446371183784331896638996881001527213
+ 40 | 0.22142857142857142857 | 0.1247446371183784331896638996881001527213
+ 100 | 0.22142857142857142857 | 0.1247446371183784331896638996881001527213
(4 rows)
</pre>