You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@madlib.apache.org by "Nikhil Kak (Jira)" <ji...@apache.org> on 2019/11/18 23:47:00 UTC
[jira] [Created] (MADLIB-1395) Term frequency and LDA - turn off notices

Nikhil Kak created MADLIB-1395:
----------------------------------

             Summary: Term frequency and LDA - turn off notices
                 Key: MADLIB-1395
                 URL: https://issues.apache.org/jira/browse/MADLIB-1395
             Project: Apache MADlib
          Issue Type: Bug
          Components: Module: Utilities
            Reporter: Nikhil Kak
             Fix For: v1.17


Turn off the notices shown in the session below by using a MinWarning("Error") decorator in the Python code.

{code}
madlib=# SELECT madlib.term_frequency('documents', -- input table
madlib(# 'docid', -- document id column
madlib(# 'words', -- vector of words in document
madlib(# 'documents_tf', -- output documents table with term frequency
madlib(# TRUE); -- TRUE to create vocabulary table
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
CONTEXT: SQL statement "
 CREATE TABLE documents_tf_vocabulary AS
 SELECT (row_number() OVER (order by word))::INTEGER - 1 as wordid,
 word::TEXT
 FROM (
 SELECT distinct(words) as word
 FROM (
 SELECT unnest(words::TEXT[]) as words
 FROM documents
 ) q1
 ) q2
 "
PL/Python function "term_frequency"
NOTICE: One or more columns in the following table(s) do not have statistics: documents
HINT: For non-partitioned tables, run analyze <table_name>(<column_list>). For partitioned tables, run analyze rootpartition <table_name>(<column_list>). See log for columns missing statistics.
CONTEXT: SQL statement "
 CREATE TABLE documents_tf_vocabulary AS
 SELECT (row_number() OVER (order by word))::INTEGER - 1 as wordid,
 word::TEXT
 FROM (
 SELECT distinct(words) as word
 FROM (
 SELECT unnest(words::TEXT[]) as words
 FROM documents
 ) q1
 ) q2
 "
PL/Python function "term_frequency"
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'docid' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT: SQL statement "
 CREATE TABLE documents_tf(
 docid INTEGER,
 wordid INTEGER,
 count INTEGER
 )
 "
PL/Python function "term_frequency"
NOTICE: One or more columns in the following table(s) do not have statistics: documents
HINT: For non-partitioned tables, run analyze <table_name>(<column_list>). For partitioned tables, run analyze rootpartition <table_name>(<column_list>). See log for columns missing statistics.
CONTEXT: SQL statement "
 INSERT INTO documents_tf
 SELECT docid, w.wordid as wordid, word_count as count
 FROM (
 SELECT docid, word::TEXT, count(*) as word_count
 FROM
 (
 SELECT docid, unnest(words::TEXT[]) as word
 FROM documents
 WHERE
 docid IS NOT NULL
 ) q1
 GROUP BY docid, word
 ) q2
 
 , documents_tf_vocabulary as w
 WHERE
 q2.word = w.word
 
 "
PL/Python function "term_frequency"
 term_frequency 
------------------------------------------------------------------------------------------
 Term frequency output in table documents_tf, vocabulary in table documents_tf_vocabulary
(1 row)
{code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)