You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2022/05/12 13:37:45 UTC
[madlib] 05/06: Doc: Update online examples for various unsupervised learning modules
This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 5c70da3364fed4eee8601f77f41d10723ab984c2
Author: Orhan Kislal <ok...@apache.org>
AuthorDate: Mon May 2 11:26:27 2022 +0300
Doc: Update online examples for various unsupervised learning modules
---
.../modules/assoc_rules/assoc_rules.sql_in | 34 ++++++++++++----------
src/ports/postgres/modules/kmeans/kmeans.sql_in | 3 +-
src/ports/postgres/modules/lda/lda.sql_in | 10 +++++++
src/ports/postgres/modules/pca/pca.sql_in | 3 +-
4 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
index 9cf05a1d..73329728 100644
--- a/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
+++ b/src/ports/postgres/modules/assoc_rules/assoc_rules.sql_in
@@ -308,21 +308,22 @@ CREATE TABLE test_data (
trans_id INT,
product TEXT
);
-INSERT INTO test_data VALUES (1, 'beer');
-INSERT INTO test_data VALUES (1, 'diapers');
-INSERT INTO test_data VALUES (1, 'chips');
-INSERT INTO test_data VALUES (2, 'beer');
-INSERT INTO test_data VALUES (2, 'diapers');
-INSERT INTO test_data VALUES (3, 'beer');
-INSERT INTO test_data VALUES (3, 'diapers');
-INSERT INTO test_data VALUES (4, 'beer');
-INSERT INTO test_data VALUES (4, 'chips');
-INSERT INTO test_data VALUES (5, 'beer');
-INSERT INTO test_data VALUES (6, 'beer');
-INSERT INTO test_data VALUES (6, 'diapers');
-INSERT INTO test_data VALUES (6, 'chips');
-INSERT INTO test_data VALUES (7, 'beer');
-INSERT INTO test_data VALUES (7, 'diapers');
+INSERT INTO test_data VALUES
+(1, 'beer'),
+(1, 'diapers'),
+(1, 'chips'),
+(2, 'beer'),
+(2, 'diapers'),
+(3, 'beer'),
+(3, 'diapers'),
+(4, 'beer'),
+(4, 'chips'),
+(5, 'beer'),
+(6, 'beer'),
+(6, 'diapers'),
+(6, 'chips'),
+(7, 'beer'),
+(7, 'diapers');
</pre>
-# Let \f$ min(support) = .25 \f$ and \f$ min(confidence) = .5 \f$, and the
@@ -331,6 +332,7 @@ In this example we set verbose to
TRUE so that we have some insight into progress of the function. We
can now generate association rules as follows:
<pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
SELECT * FROM madlib.assoc_rules( .25, -- Support
.5, -- Confidence
'trans_id', -- Transaction id col
@@ -369,6 +371,7 @@ Result:
-# Limit association rules generated from itemsets of size at most 2. This parameter is
a good way to reduce long run times.
<pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
SELECT * FROM madlib.assoc_rules( .25, -- Support
.5, -- Confidence
'trans_id', -- Transaction id col
@@ -421,6 +424,7 @@ Result:
-# Limit the size of right hand side to 1. This parameter is a good way to
reduce long run times.
<pre class="example">
+DROP TABLE IF EXISTS assoc_rules;
SELECT * FROM madlib.assoc_rules( .25, -- Support
.5, -- Confidence
'trans_id', -- Transaction id col
diff --git a/src/ports/postgres/modules/kmeans/kmeans.sql_in b/src/ports/postgres/modules/kmeans/kmeans.sql_in
index 30031488..5a45d059 100644
--- a/src/ports/postgres/modules/kmeans/kmeans.sql_in
+++ b/src/ports/postgres/modules/kmeans/kmeans.sql_in
@@ -546,6 +546,7 @@ SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input po
'centroids', -- Column in centroids table containing centroids
'madlib.squared_dist_norm2' -- Distance function
);
+\\x off
SELECT * FROM km_points_silh ORDER BY pid;
</pre>
<pre class="result">
@@ -567,7 +568,6 @@ SELECT * FROM km_points_silh ORDER BY pid;
-# Find the cluster assignment for each point.
Use the closest_column() function to map each point to the cluster that it belongs to.
<pre class="example">
-\\x off;
DROP TABLE IF EXISTS point_cluster_map;
CREATE TABLE point_cluster_map AS
SELECT data.*, (madlib.closest_column(centroids, points, 'madlib.squared_dist_norm2')).*
@@ -785,6 +785,7 @@ SELECT * FROM madlib.simple_silhouette_points( 'km_sample', -- Input po
(SELECT centroids FROM km_result_auto WHERE k=3), -- centroids array
'madlib.squared_dist_norm2' -- Distance function
);
+\\x off
SELECT * FROM km_points_silh ORDER BY pid;
</pre>
<pre class="result">
diff --git a/src/ports/postgres/modules/lda/lda.sql_in b/src/ports/postgres/modules/lda/lda.sql_in
index 32b22fd6..0d3279f9 100644
--- a/src/ports/postgres/modules/lda/lda.sql_in
+++ b/src/ports/postgres/modules/lda/lda.sql_in
@@ -497,6 +497,7 @@ UPDATE documents SET words =
regexp_split_to_array(lower(
regexp_replace(contents, E'[,.;\']','', 'g')
), E'[\\\\s+]');
+\\x on
SELECT * FROM documents ORDER BY docid;
</pre>
<pre class="result">
@@ -526,6 +527,7 @@ SELECT madlib.term_frequency('documents', -- input table
'words', -- vector of words in document
'documents_tf', -- output documents table with term frequency
TRUE); -- TRUE to created vocabulary table
+\\x off
SELECT * FROM documents_tf ORDER BY docid LIMIT 20;
</pre>
<pre class="result">
@@ -612,6 +614,7 @@ SELECT madlib.lda_train( 'documents_tf', -- documents table in the form of t
5, -- Dirichlet prior for the per-doc topic multinomial (alpha)
0.01 -- Dirichlet prior for the per-topic word multinomial (beta)
);
+\\x on
SELECT * FROM lda_output_data ORDER BY docid;
</pre>
<pre class="result">
@@ -671,6 +674,7 @@ SELECT madlib.lda_get_topic_desc( 'lda_model', -- LDA model gener
'documents_tf_vocabulary', -- vocabulary table that maps wordid to word
'helper_output_table', -- output table for per-topic descriptions
5); -- k: number of top words for each topic
+\\x off
SELECT * FROM helper_output_table ORDER BY topicid, prob DESC LIMIT 40;
</pre>
<pre class="result">
@@ -758,6 +762,7 @@ which words are associated with each topic by frequency:
DROP TABLE IF EXISTS topic_word_count;
SELECT madlib.lda_get_topic_word_count( 'lda_model',
'topic_word_count');
+\\x on
SELECT * FROM topic_word_count ORDER BY topicid;
</pre>
<pre class="result">
@@ -782,6 +787,7 @@ Get the per-document word to topic mapping:
DROP TABLE IF EXISTS helper_output_table;
SELECT madlib.lda_get_word_topic_mapping('lda_output_data', -- Output table from training
'helper_output_table');
+\\x off
SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
</pre>
<pre class="result">
@@ -840,6 +846,7 @@ SELECT madlib.lda_predict( 'documents_tf', -- Document to predict
'lda_model', -- LDA model from training
'outdata_predict' -- Output table for predict results
);
+\\x on
SELECT * FROM outdata_predict;
</pre>
<pre class="result">
@@ -883,6 +890,7 @@ per-document word to topic mapping that we used on the learned model.)
DROP TABLE IF EXISTS helper_output_table;
SELECT madlib.lda_get_word_topic_mapping('outdata_predict', -- Output table from prediction
'helper_output_table');
+\\x off
SELECT * FROM helper_output_table ORDER BY docid LIMIT 40;
</pre>
<pre class="result">
@@ -961,6 +969,7 @@ SELECT madlib.lda_train( 'documents_tf', -- documents table in the form
2, -- Evaluate perplexity every n iterations
0.3 -- Tolerance to stop iteration
);
+\\x on
SELECT voc_size, topic_num, alpha, beta, num_iterations, perplexity, perplexity_iters from lda_model_perp;
</pre>
<pre class="result">
@@ -978,6 +987,7 @@ perplexity values because we computed it only every 2nd iteration to save time.
As expected, the perplexity
on the training data is that same as the final iteration value:
<pre class="example">
+\\x off
SELECT madlib.lda_get_perplexity( 'lda_model_perp',
'lda_output_data_perp'
);
diff --git a/src/ports/postgres/modules/pca/pca.sql_in b/src/ports/postgres/modules/pca/pca.sql_in
index 92b74e2a..c4f95e73 100644
--- a/src/ports/postgres/modules/pca/pca.sql_in
+++ b/src/ports/postgres/modules/pca/pca.sql_in
@@ -289,7 +289,9 @@ This sumary table has the following columns:
-# View online help for the PCA training functions:
<pre class="example">
SELECT madlib.pca_train();
+</pre>
or
+<pre class="example">
SELECT madlib.pca_sparse_train();
</pre>
@@ -324,7 +326,6 @@ SELECT * FROM result_table ORDER BY row_id;
-# Run the PCA function for a specified proportion of variance and view the results:
<pre class="example">
-%%sql
DROP TABLE IF EXISTS result_table, result_table_mean;
SELECT madlib.pca_train('mat', -- Source table
'result_table', -- Output table