You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2022/05/12 13:37:45 UTC

[madlib] 05/06: Doc: Update online examples for various unsupervised learning modules

This is an automated email from the ASF dual-hosted git repository.

okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 5c70da3364fed4eee8601f77f41d10723ab984c2
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Mon May 2 11:26:27 2022 +0300

    Doc: Update online examples for various unsupervised learning modules
---
 .../modules/assoc_rules/assoc_rules.sql_in         | 34 ++++++++++++----------
 src/ports/postgres/modules/kmeans/kmeans.sql_in    |  3 +-
 src/ports/postgres/modules/lda/lda.sql_in          | 10 +++++++
 src/ports/postgres/modules/pca/pca.sql_in          |  3 +-
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 9cf05a1d..73329728 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -308,21 +308,22 @@ CREATE TABLE test_data (
     trans_id INT,
     product TEXT
 );
-INSERT INTO test_data VALUES (1, 'beer');
-INSERT INTO test_data VALUES (1, 'diapers');
-INSERT INTO test_data VALUES (1, 'chips');
-INSERT INTO test_data VALUES (2, 'beer');
-INSERT INTO test_data VALUES (2, 'diapers');
-INSERT INTO test_data VALUES (3, 'beer');
-INSERT INTO test_data VALUES (3, 'diapers');
-INSERT INTO test_data VALUES (4, 'beer');
-INSERT INTO test_data VALUES (4, 'chips');
-INSERT INTO test_data VALUES (5, 'beer');
-INSERT INTO test_data VALUES (6, 'beer');
-INSERT INTO test_data VALUES (6, 'diapers');
-INSERT INTO test_data VALUES (6, 'chips');
-INSERT INTO test_data VALUES (7, 'beer');
-INSERT INTO test_data VALUES (7, 'diapers');
+INSERT INTO test_data VALUES
+(1, 'beer'),
+(1, 'diapers'),
+(1, 'chips'),
+(2, 'beer'),
+(2, 'diapers'),
+(3, 'beer'),
+(3, 'diapers'),
+(4, 'beer'),
+(4, 'chips'),
+(5, 'beer'),
+(6, 'beer'),
+(6, 'diapers'),
+(6, 'chips'),
+(7, 'beer'),
+(7, 'diapers');
 </pre>
 
 -# Let \f$ min(support) = .25 \f$ and \f$ min(confidence) = .5 \f$, and the
@@ -331,6 +332,7 @@ In this example we set verbose to
 TRUE so that we have some insight into progress of the function. We
 can now generate association rules as follows:
 <pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
 SELECT * FROM madlib.assoc_rules( .25,            -- Support
                                   .5,             -- Confidence
                                   'trans_id',     -- Transaction id col
@@ -369,6 +371,7 @@ Result:
 -# Limit association rules generated from itemsets of size at most 2.  This parameter is
 a good way to reduce long run times.
 <pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
 SELECT * FROM madlib.assoc_rules( .25,            -- Support
                                   .5,             -- Confidence
                                   'trans_id',     -- Transaction id col
@@ -421,6 +424,7 @@ Result:
 -# Limit the size of right hand side to 1.  This parameter is a good way to
 reduce long run times.
 <pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
 SELECT * FROM madlib.assoc_rules( .25,            -- Support
                                   .5,             -- Confidence
                                   'trans_id',     -- Transaction id col
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 30031488..5a45d059 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -546,6 +546,7 @@ SELECT * FROM madlib.simple_silhouette_points( 'km_sample',          -- Input po
                                               'centroids',           -- Column in centroids table containing centroids
                                               'madlib.squared_dist_norm2'   -- Distance function
                                       );
+\\x off
 SELECT * FROM km_points_silh ORDER BY pid;
 </pre>
 <pre class="result">
@@ -567,7 +568,6 @@ SELECT * FROM km_points_silh ORDER BY pid;
 -#  Find the cluster assignment for each point.
 Use the closest_column() function to map each point to the cluster that it belongs to.
 <pre class="example">
-\\x off;
 DROP TABLE IF EXISTS point_cluster_map;
 CREATE TABLE point_cluster_map AS
 SELECT data.*, (madlib.closest_column(centroids, points, 'madlib.squared_dist_norm2')).*
@@ -785,6 +785,7 @@ SELECT * FROM madlib.simple_silhouette_points( 'km_sample',          -- Input po
                                               (SELECT centroids FROM km_result_auto WHERE k=3), -- centroids array
                                               'madlib.squared_dist_norm2'   -- Distance function
                                       );
+\\x off
 SELECT * FROM km_points_silh ORDER BY pid;
 </pre>
 <pre class="result">
diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in
index 32b22fd6..0d3279f9 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -497,6 +497,7 @@ UPDATE documents SET words =
     regexp_split_to_array(lower(
     regexp_replace(contents, E'[,.;\']','', 'g')
     ), E'[\\\\s+]');
+\\x on
 SELECT * FROM documents ORDER BY docid;
 </pre>
 <pre class="result">
@@ -526,6 +527,7 @@ SELECT madlib.term_frequency('documents',    -- input table
                              'words',        -- vector of words in document
                              'documents_tf', -- output documents table with term frequency
                              TRUE);          -- TRUE to created vocabulary table
+\\x off
 SELECT * FROM documents_tf ORDER BY docid LIMIT 20;
 </pre>
 <pre class="result">
@@ -612,6 +614,7 @@ SELECT madlib.lda_train( 'documents_tf',     -- documents table in the form of t
                          5,                  -- Dirichlet prior for the per-doc topic multinomial (alpha)
                          0.01                -- Dirichlet prior for the per-topic word multinomial (beta)
                        );
+\\x on
 SELECT * FROM lda_output_data ORDER BY docid;
 </pre>
 <pre class="result">
@@ -671,6 +674,7 @@ SELECT madlib.lda_get_topic_desc( 'lda_model',                -- LDA model gener
                                   'documents_tf_vocabulary',  -- vocabulary table that maps wordid to word
                                   'helper_output_table',      -- output table for per-topic descriptions
                                   5);                         -- k: number of top words for each topic
+\\x off
 SELECT * FROM helper_output_table ORDER BY topicid, prob DESC LIMIT 40;
 </pre>
 <pre class="result">
@@ -758,6 +762,7 @@ which words are associated with each topic by frequency:
 DROP TABLE IF EXISTS topic_word_count;
 SELECT madlib.lda_get_topic_word_count( 'lda_model',
                                         'topic_word_count');
+\\x on
 SELECT * FROM topic_word_count ORDER BY topicid;
 </pre>
 <pre class="result">
@@ -782,6 +787,7 @@ Get the per-document word to topic mapping:
 DROP TABLE IF EXISTS helper_output_table;
 SELECT madlib.lda_get_word_topic_mapping('lda_output_data',  -- Output table from training
                                          'helper_output_table');
+\\x off
 SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 </pre>
 <pre class="result">
@@ -840,6 +846,7 @@ SELECT madlib.lda_predict( 'documents_tf',          -- Document to predict
                            'lda_model',             -- LDA model from training
                            'outdata_predict'        -- Output table for predict results
                          );
+\\x on
 SELECT * FROM outdata_predict;
 </pre>
 <pre class="result">
@@ -883,6 +890,7 @@ per-document word to topic mapping that we used on the learned model.)
 DROP TABLE IF EXISTS helper_output_table;
 SELECT madlib.lda_get_word_topic_mapping('outdata_predict',  -- Output table from prediction
                                          'helper_output_table');
+\\x off
 SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
 </pre>
 <pre class="result">
@@ -961,6 +969,7 @@ SELECT madlib.lda_train( 'documents_tf',          -- documents table in the form
                          2,                       -- Evaluate perplexity every n iterations
                          0.3                      -- Tolerance to stop iteration
                        );
+\\x on
 SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, perplexity_iters from lda_model_perp;
 </pre>
 <pre class="result">
@@ -978,6 +987,7 @@ perplexity values because we computed it only every 2nd iteration to save time.
 As expected, the perplexity
 on the training data is that same as the final iteration value:
 <pre class="example">
+\\x off
 SELECT madlib.lda_get_perplexity( 'lda_model_perp',
                                   'lda_output_data_perp'
                                 );
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 92b74e2a..c4f95e73 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -289,7 +289,9 @@ This sumary table has the following columns:
 -# View online help for the PCA training functions:
 <pre class="example">
 SELECT madlib.pca_train();
+</pre>
 or
+<pre class="example">
 SELECT madlib.pca_sparse_train();
 </pre>
 
@@ -324,7 +326,6 @@ SELECT * FROM result_table ORDER BY row_id;
 
 -# Run the PCA function for a specified proportion of variance and view the results:
 <pre class="example">
-%%sql
 DROP TABLE IF EXISTS result_table, result_table_mean;
 SELECT madlib.pca_train('mat',             -- Source table
                         'result_table',    -- Output table