You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/10 01:59:34 UTC
incubator-madlib git commit: Term Freq: Allow custom col names, avoid temp vocab

Repository: incubator-madlib
Updated Branches:
  refs/heads/master a6acafdd5 -> 5952569bf


Term Freq: Allow custom col names, avoid temp vocab

JIRA: MADLIB-933

- Fixed a minor bug that forced users to use "docid" as the document id column name (the name was hard-coded in the GROUP BY clause).
- Fixed an incorrect temp table output for the vocabulary.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/5952569b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/5952569b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/5952569b

Branch: refs/heads/master
Commit: 5952569bff0d721a1c54a4b6ac9b60a64f0111e9
Parents: a6acafd
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Mon Dec 7 14:01:04 2015 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Tue Feb 9 16:58:13 2016 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/utilities/text_utilities.py_in  | 6 +++---
 src/ports/postgres/modules/utilities/text_utilities.sql_in | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/5952569b/src/ports/postgres/modules/utilities/text_utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/text_utilities.py_in b/src/ports/postgres/modules/utilities/text_utilities.py_in
index de117e7..bddef25 100644
--- a/src/ports/postgres/modules/utilities/text_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/text_utilities.py_in
@@ -41,7 +41,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
     word_type = 'INTEGER' if vocab_table else 'TEXT'
     word_name = 'wordid' if vocab_table else 'word'
     plpy.execute("""
-        CREATE TEMP TABLE {output_table}(
+        CREATE TABLE {output_table}(
             {doc_id_col} INTEGER,
             {word_name} {word_type},
             count INTEGER
@@ -72,7 +72,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
                     WHERE
                         {doc_id_col} IS NOT NULL
                 ) q1
-                GROUP BY docid, word
+                GROUP BY {doc_id_col}, word
             ) q2
             {inner_query}
         """.format(**locals()))
@@ -80,7 +80,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
 
 
 def term_frequency(input_table, doc_id_col, word_vec_col,
-                   output_table, compute_vocab=None):
+                   output_table, compute_vocab=False):
 
     input_tbl_valid(input_table, "Term frequency")
     output_tbl_valid(output_table, "Term frequency")

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/5952569b/src/ports/postgres/modules/utilities/text_utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/text_utilities.sql_in b/src/ports/postgres/modules/utilities/text_utilities.sql_in
index c8f5aaf..5e039e6 100644
--- a/src/ports/postgres/modules/utilities/text_utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/text_utilities.sql_in
@@ -97,9 +97,8 @@ INSERT INTO documents VALUES
 
 -# Add a new column containing the words (lower-cased) in a text array
 <pre class="example">
-ALTER TABLE documents DROP COLUMN words;
 ALTER TABLE documents ADD COLUMN words TEXT[];
-UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\s+\\.]');
+UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\\\s+\\\\.]');
 </pre>
 
 -# Compute the frequency of each word in each document