You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2016/02/10 01:59:34 UTC
incubator-madlib git commit: Term Freq: Allow custom col names, avoid temp vocab
Repository: incubator-madlib
Updated Branches:
refs/heads/master a6acafdd5 -> 5952569bf
Term Freq: Allow custom col names, avoid temp vocab
JIRA: MADLIB-933
- Fixed a minor bug that forced users to use "docid" as the document ID column name.
- Fixed an incorrect temp table output for the vocabulary.
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/5952569b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/5952569b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/5952569b
Branch: refs/heads/master
Commit: 5952569bff0d721a1c54a4b6ac9b60a64f0111e9
Parents: a6acafd
Author: Rahul Iyer <ri...@pivotal.io>
Authored: Mon Dec 7 14:01:04 2015 -0800
Committer: Rahul Iyer <ri...@pivotal.io>
Committed: Tue Feb 9 16:58:13 2016 -0800
----------------------------------------------------------------------
src/ports/postgres/modules/utilities/text_utilities.py_in | 6 +++---
src/ports/postgres/modules/utilities/text_utilities.sql_in | 3 +--
2 files changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/5952569b/src/ports/postgres/modules/utilities/text_utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/text_utilities.py_in b/src/ports/postgres/modules/utilities/text_utilities.py_in
index de117e7..bddef25 100644
--- a/src/ports/postgres/modules/utilities/text_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/text_utilities.py_in
@@ -41,7 +41,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
word_type = 'INTEGER' if vocab_table else 'TEXT'
word_name = 'wordid' if vocab_table else 'word'
plpy.execute("""
- CREATE TEMP TABLE {output_table}(
+ CREATE TABLE {output_table}(
{doc_id_col} INTEGER,
{word_name} {word_type},
count INTEGER
@@ -72,7 +72,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
WHERE
{doc_id_col} IS NOT NULL
) q1
- GROUP BY docid, word
+ GROUP BY {doc_id_col}, word
) q2
{inner_query}
""".format(**locals()))
@@ -80,7 +80,7 @@ def _create_tf_table(input_table, doc_id_col, word_vec_col,
def term_frequency(input_table, doc_id_col, word_vec_col,
- output_table, compute_vocab=None):
+ output_table, compute_vocab=False):
input_tbl_valid(input_table, "Term frequency")
output_tbl_valid(output_table, "Term frequency")
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/5952569b/src/ports/postgres/modules/utilities/text_utilities.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/text_utilities.sql_in b/src/ports/postgres/modules/utilities/text_utilities.sql_in
index c8f5aaf..5e039e6 100644
--- a/src/ports/postgres/modules/utilities/text_utilities.sql_in
+++ b/src/ports/postgres/modules/utilities/text_utilities.sql_in
@@ -97,9 +97,8 @@ INSERT INTO documents VALUES
-# Add a new column containing the words (lower-cased) in a text array
<pre class="example">
-ALTER TABLE documents DROP COLUMN words;
ALTER TABLE documents ADD COLUMN words TEXT[];
-UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\s+\\.]');
+UPDATE documents SET words = regexp_split_to_array(lower(doc_contents), E'[\\\\s+\\\\.]');
</pre>
-# Compute the frequency of each word in each document