You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@madlib.apache.org by ri...@apache.org on 2018/07/16 06:36:34 UTC

[1/2] madlib git commit: Utilities: Add cols2vec() to convert columns to array

Repository: madlib
Updated Branches:
  refs/heads/master 3b527b82a -> 950114ccd


Utilities: Add cols2vec() to convert columns to array

JIRA: MADLIB-1239

This commit adds a new function called cols2vec that can be used to
convert features from multiple columns of an input table into a feature
array in a single column.


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/2828d86a
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/2828d86a
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/2828d86a

Branch: refs/heads/master
Commit: 2828d86a64bedddb6849913eaaf7734042922e6e
Parents: 3b527b8
Author: Himanshu Pandey <hp...@pivotal.io>
Authored: Fri Jun 15 01:33:27 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Sun Jul 15 23:35:14 2018 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   3 +
 .../postgres/modules/utilities/cols2vec.py_in   | 111 +++++++++++
 .../postgres/modules/utilities/cols2vec.sql_in  | 191 +++++++++++++++++++
 .../modules/utilities/test/cols2vec.sql_in      |  89 +++++++++
 4 files changed, 394 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/2828d86a/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index e41e6c9..341f115 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -284,6 +284,9 @@ Contains graph algorithms.
     @defgroup @grp_utilities Utilities
     @ingroup grp_other_functions
 
+    @defgroup grp_cols2vec Columns to Vector
+    @ingroup grp_utility_functions
+
 @defgroup grp_early_stage Early Stage Development
 @brief A collection of implementations which are in early stage of development.
 There may be some issues that will be addressed in a future version.

http://git-wip-us.apache.org/repos/asf/madlib/blob/2828d86a/src/ports/postgres/modules/utilities/cols2vec.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in b/src/ports/postgres/modules/utilities/cols2vec.py_in
new file mode 100644
index 0000000..ced53e9
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/cols2vec.py_in
@@ -0,0 +1,111 @@
+"""
+@file cols2vec.py_in
+
+@brief Utility to convert Columns to array
+
+"""
+
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import split_quoted_delimited_str
+from utilities.utilities import _string_to_array
+from utilities.utilities import _assert
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import is_var_valid
+from utilities.validate_args import get_cols
+from utilities.validate_args import quote_ident
+from utilities.utilities import py_list_to_sql_string
+
+
+m4_changequote(`<!', `!>')
+
+
+def validate_cols2vec_args(source_table, output_table,
+                           list_of_features, list_of_features_to_exclude, cols_to_output, **kwargs):
+    """
+        Function to validate input parameters
+    """
+    if list_of_features.strip() != '*':
+        if not (list_of_features and list_of_features.strip()):
+            plpy.error("Features to include is empty")
+        _assert(
+            columns_exist_in_table(
+                source_table, split_quoted_delimited_str(list_of_features)),
+            "Invalid columns to list of features {0}".format(list_of_features))
+
+    if cols_to_output and cols_to_output.strip() != '*':
+        _assert(
+            columns_exist_in_table(
+                source_table, _string_to_array(cols_to_output)),
+            "Invalid columns to output list {0}".format(cols_to_output))
+
+
+def cols2vec(schema_madlib, source_table, output_table, list_of_features,
+             list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
+    """
+    Args:
+        @param schema_madlib:               Name of MADlib schema
+        @param model:                       Name of table containing the tree model
+        @param source_table:                Name of table containing prediction data
+        @param output_table:                Name of table to output the results
+        @param list_of_features:            Comma-separated string of column names or
+                                            expressions to put into feature array.
+                                            Can also be a '*' implying all columns
+                                            are to be put into feature array.
+        @param list_of_features_to_exclude: Comma-separated string of column names
+                                            to exclude from the feature array
+        @param cols_to_output:              Comma-separated string of column names
+                                            from the source table to keep in the output table,
+                                            in addition to the feature array.
+
+    Returns:
+        None
+
+    """
+
+    with MinWarning('warning'):
+        validate_cols2vec_args(source_table, output_table, list_of_features,
+                               list_of_features_to_exclude, cols_to_output, **kwargs)
+
+        if list_of_features.strip() == '*':
+            all_cols = get_cols(source_table, schema_madlib)
+            exclude_set = set(split_quoted_delimited_str(
+                list_of_features_to_exclude))
+            feature_list = [col for col in all_cols if col not in exclude_set]
+
+        else:
+            feature_list = split_quoted_delimited_str(list_of_features)
+
+        feature_cols = py_list_to_sql_string(
+            list(feature_list), "text", False)
+        filtered_list_of_features = ",".join(feature_list)
+
+        output_cols = ''
+        if cols_to_output:
+            output_cols_list = [', '.join(get_cols(source_table, schema_madlib)) if col == '*' else col
+                                for col in split_quoted_delimited_str(cols_to_output)]
+            output_cols = ', '.join(output_cols_list) + ","
+
+        plpy.execute("""
+    		CREATE TABLE {output_table} AS
+    		select {output_cols}
+            array[{filtered_list_of_features}] as feature_vector
+    		from {source_table}
+    		""".format(**locals()))
+
+        plpy.execute("""
+            CREATE TABLE {output_table}_summary
+            (
+            source_table TEXT,
+            list_of_features TEXT,
+            list_of_features_to_exclude TEXT,
+            feature_cols TEXT[]
+            )""".format(output_table=output_table))
+
+        plpy.execute("""
+            INSERT INTO {output_table}_summary
+            VALUES ('{source_table}','{list_of_features}',
+            '{list_of_features_to_exclude}',
+            (SELECT {feature_cols} as feature_names from {source_table} limit 1))
+            """.format( **locals()))
+        return

http://git-wip-us.apache.org/repos/asf/madlib/blob/2828d86a/src/ports/postgres/modules/utilities/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in b/src/ports/postgres/modules/utilities/cols2vec.sql_in
new file mode 100644
index 0000000..035b992
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -0,0 +1,191 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------- *//**
+*
+*file cols2vec.sql_in
+*
+*brief A set of utilities to ease basic table transformations (such as *aggregating multiple columns in an array)
+*
+*
+*//* ----------------------------------------------------------------------- */
+ m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_cols2vec
+
+@brief Create a new table with all feature columns inserted into a single column as an array
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li class="level1"><a href="#cols2vec_syntax">Syntax</a>
+<li class="level1"><a href="#cols2vec_usage">Usage</a>
+<li class="level1"><a href="#cols2vec_example">Example</a>
+</ul>
+</div>
+
+@about
+Convert all feature columns in your table into an array in a single column.
+
+Given a table with varying number of columns, this function will create an output table that will contain the feature columns into an array. A summry table will be created and will the names
+of the features into array so that this process can be reversed using the function
+vec2cols from array_utilities in PDLTools.
+
+The columns that need NOT be included in the feature array need to be specified in the exclude_columns field.
+
+@anchor cols2vec_usage
+@usage
+
+<pre class="syntax">
+cols2vec(
+    source_table ,
+    output_table ,
+    list_of_features,
+    list_of_features_to_exclude ,
+    cols_to_output  
+) 
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>source_table</dt>
+<dd>TEXT. Name of the table containing the source data.</tt>.
+
+<dt>output_table</dt>
+<dd>TEXT. Name of the generated table containing the output.</tt>
+
+<dt>list_of_features</dt>
+<dd>TEXT. Comma-separated string of column names or expressions to put into feature array. Can also be a '*' implying all columns are to be put into feature array (except for the ones included in the next argument that lists exclusions). Array columns in the source table are not supported in the 'list_of_features'. </tt>
+
+<dt>list_of_features_to_exclude</dt>
+<dd>TEXT. Default NULL. Comma-separated string of column names to exclude from the feature array.  Use only when 'list_of_features' is '*'. </tt>
+
+<dt>cols_to_output</dt>
+<dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array.  To keep all columns from the source table, use '*'. </tt>
+
+</dd>
+
+
+@anchor cols2vec_example
+@examp
+
+<pre class="syntax">
+
+DROP TABLE IF EXISTS cols2vec;
+CREATE TABLE cols2vec (
+    id              bigint,
+    label           int,
+    feat1           int,
+    feat2           int,
+    feat3           float,
+    other_col       float
+);
+
+INSERT INTO cols2vec  VALUES
+(1, 0, 1, 1, 0.5, 0.9),
+(2, 1, 0, 1, 0.3, 0.3),
+(3, 0, 0, 0, 0.1, 1.1),
+(4, 1, 1, 0, 0.9, 0.4);
+
+
+<pre>
+
+<pre class="syntax">
+drop table if exists cols2vec_result;
+drop table if exists cols2vec_result_summary;
+
+
+select cols2vec(
+      'cols2vec',                  -- input table
+      'cols2vec_result',		       -- Output table
+      'feat1,feat2,feat3',         -- Comma Seperated List of Features
+      'id',                        --  Features To Exclude
+      'id,label'	 			           -- Output columns to be included in
+      								                output table
+);
+
+select * from cols2vec_result;
+</pre>
+
+
+-# Expected output:
+
+<pre class="result">
+select * from cols2vec_result;
+
+ id | label | feature_vector 
+----+-------+----------------
+  1 |     0 | {1,1,0.5}
+  2 |     1 | {0,1,0.3}
+  3 |     0 | {0,0,0.1}
+  4 |     1 | {1,0,0.9}
+(4 rows)
+
+select * from cols2vec_result_summary;
+    feature_names    
+---------------------
+ {feat1,feat2,feat3}
+ {feat1,feat2,feat3}
+ {feat1,feat2,feat3}
+ {feat1,feat2,feat3}
+(4 rows)
+
+</pre>
+
+
+<pre class="syntax">
+</pre>
+*/
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
+    source_table VARCHAR,
+    output_table VARCHAR,
+    list_of_features VARCHAR,
+    list_of_features_to_exclude VARCHAR,
+    cols_to_output VARCHAR 
+) RETURNS void AS $$
+    PythonFunction(cols_vec, cols2vec, cols2vec)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');    
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
+    source_table VARCHAR,
+    output_table VARCHAR,
+    list_of_features VARCHAR,
+    list_of_features_to_exclude VARCHAR
+) RETURNS void AS $$
+SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,$4,NULL)
+
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
+    source_table VARCHAR,
+    output_table VARCHAR,
+    list_of_features VARCHAR
+) RETURNS void AS $$
+SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,NULL,NULL)
+
+$$ LANGUAGE SQL
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

http://git-wip-us.apache.org/repos/asf/madlib/blob/2828d86a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
new file mode 100644
index 0000000..33fe024
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
@@ -0,0 +1,89 @@
+/* ----------------------------------------------------------------------- *//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- */
+
+DROP TABLE IF EXISTS cols2vec;
+CREATE TABLE cols2vec (
+    id              bigint,
+    label           int,
+    feat1           int,
+    feat2           int,
+    feat3           float,
+    other_col       float
+);
+
+INSERT INTO cols2vec  VALUES
+(1, 0, 1, 1, 0.5, 0.9),
+(2, 1, 0, 1, 0.3, 0.3),
+(3, 0, 0, 0, 0.1, 1.1),
+(4, 1, 1, 0, 0.9, 0.4);
+
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+
+select cols2vec( 'cols2vec',  'cols2vec_out','feat1,feat2,feat3', 'id', 'id,label' );
+
+select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec') from cols2vec_out where id = 1;
+
+
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+
+select cols2vec( 'cols2vec',  'cols2vec_out','*', 'id,other_col', 'id,label' );
+
+
+select assert(feature_vector = '{1,1,0.5,0}','Incorrect results for cols2vec') from cols2vec_out where id = 1;
+
+
+DROP TABLE IF EXISTS cols2vec_spcl;
+CREATE TABLE cols2vec_spcl (
+    "i,Ж!#'d"              	bigint,
+    "lab$$''%*Ж!#''()el"    int,
+    "fe''at1"           	int,
+    feat2           		int,
+    "fe'%*()at3"           	float,
+    other_col       		float,
+    "se''x" 				TEXT
+
+
+);
+
+INSERT INTO cols2vec_spcl  VALUES
+(1, 0, 1, 1, 0.5, 0.9,'M''M'),
+(2, 1, 0, 1, 0.3, 0.3,'M$M'),
+(3, 0, 0, 0, 0.1, 1.1,'M,M'),
+(4, 1, 1, 0, 0.9, 0.4,'M@[}(:*;M'),
+(5, 1, 0, 1, 0.85, 0.34,'M@[}(:*;M'),
+(6, 1, 0, 1, 0.63, 0.12,'M"M'),
+(7, 0, 1, 0, 0.7, 1.4,'MЖM');
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+
+select cols2vec( 'cols2vec_spcl',  'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"', Null, '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"' );
+
+select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec') from cols2vec_out where "i,Ж!#'d" = 1;
+
+
+DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+
+select cols2vec( 'cols2vec_spcl',  'cols2vec_out','*', '"se''''x"', '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"' );
+select assert(feature_vector = '{0.5,1,0,0.9,1,1}','Incorrect results for cols2vec') from cols2vec_out where "i,Ж!#'d" = 1;
+
+

[2/2] madlib git commit: Utilties: Refactor and clean cols2vec from 2828d86

Posted by ri...@apache.org.

Utilties: Refactor and clean cols2vec from 2828d86

JIRA: MADLIB-1239

Closes #288


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/950114cc
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/950114cc
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/950114cc

Branch: refs/heads/master
Commit: 950114ccdbbdd81750624a41390d5a35d11c008a
Parents: 2828d86
Author: Rahul Iyer <ri...@apache.org>
Authored: Thu Jul 12 16:44:57 2018 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Sun Jul 15 23:36:01 2018 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   5 +-
 .../postgres/modules/utilities/cols2vec.py_in   | 110 ++++++------
 .../postgres/modules/utilities/cols2vec.sql_in  | 173 ++++++++++---------
 .../modules/utilities/test/cols2vec.sql_in      |  54 +++---
 .../postgres/modules/utilities/utilities.py_in  |   6 +-
 5 files changed, 183 insertions(+), 165 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/950114cc/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 341f115..c2c9a7a 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -262,6 +262,9 @@ Contains graph algorithms.
 
 
 @defgroup grp_other_functions Other Functions
+    @defgroup grp_cols2vec Columns to Vector
+    @ingroup grp_other_functions
+
     @defgroup grp_linear_solver Linear Solvers
     @ingroup grp_other_functions
     @{A collection of methods that implement solutions for systems of consistent linear equations. @}
@@ -284,8 +287,6 @@ Contains graph algorithms.
     @defgroup @grp_utilities Utilities
     @ingroup grp_other_functions
 
-    @defgroup grp_cols2vec Columns to Vector
-    @ingroup grp_utility_functions
 
 @defgroup grp_early_stage Early Stage Development
 @brief A collection of implementations which are in early stage of development.

http://git-wip-us.apache.org/repos/asf/madlib/blob/950114cc/src/ports/postgres/modules/utilities/cols2vec.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in b/src/ports/postgres/modules/utilities/cols2vec.py_in
index ced53e9..b38b3d6 100644
--- a/src/ports/postgres/modules/utilities/cols2vec.py_in
+++ b/src/ports/postgres/modules/utilities/cols2vec.py_in
@@ -6,15 +6,17 @@
 """
 
 import plpy
-from utilities.control import MinWarning
-from utilities.utilities import split_quoted_delimited_str
-from utilities.utilities import _string_to_array
-from utilities.utilities import _assert
-from utilities.validate_args import columns_exist_in_table
-from utilities.validate_args import is_var_valid
-from utilities.validate_args import get_cols
-from utilities.validate_args import quote_ident
-from utilities.utilities import py_list_to_sql_string
+from control import MinWarning
+from internal.db_utils import quote_literal
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import _assert
+from utilities import add_postfix
+from validate_args import columns_exist_in_table
+from validate_args import is_var_valid
+from validate_args import get_cols
+from validate_args import quote_ident
+from utilities import py_list_to_sql_string
 
 
 m4_changequote(`<!', `!>')
@@ -31,12 +33,12 @@ def validate_cols2vec_args(source_table, output_table,
         _assert(
             columns_exist_in_table(
                 source_table, split_quoted_delimited_str(list_of_features)),
-            "Invalid columns to list of features {0}".format(list_of_features))
+            "Invalid columns in list_of_features {0}".format(list_of_features))
 
     if cols_to_output and cols_to_output.strip() != '*':
         _assert(
             columns_exist_in_table(
-                source_table, _string_to_array(cols_to_output)),
+                source_table, split_quoted_delimited_str(cols_to_output)),
             "Invalid columns to output list {0}".format(cols_to_output))
 
 
@@ -44,68 +46,64 @@ def cols2vec(schema_madlib, source_table, output_table, list_of_features,
              list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
     """
     Args:
-        @param schema_madlib:               Name of MADlib schema
-        @param model:                       Name of table containing the tree model
-        @param source_table:                Name of table containing prediction data
-        @param output_table:                Name of table to output the results
-        @param list_of_features:            Comma-separated string of column names or
-                                            expressions to put into feature array.
-                                            Can also be a '*' implying all columns
-                                            are to be put into feature array.
+        @param schema_madlib: Name of MADlib schema
+        @param model: Name of table containing the tree model
+        @param source_table: Name of table containing prediction data
+        @param output_table: Name of table to output the results
+        @param list_of_features: Comma-separated string of column names or
+                                 expressions to put into feature array.
+                                 Can also be a '*' implying all columns
+                                 are to be put into feature array.
         @param list_of_features_to_exclude: Comma-separated string of column names
                                             to exclude from the feature array
-        @param cols_to_output:              Comma-separated string of column names
-                                            from the source table to keep in the output table,
-                                            in addition to the feature array.
+        @param cols_to_output: Comma-separated string of column names
+                               from the source table to keep in the output table,
+                               in addition to the feature array.
 
     Returns:
         None
 
     """
-
     with MinWarning('warning'):
         validate_cols2vec_args(source_table, output_table, list_of_features,
-                               list_of_features_to_exclude, cols_to_output, **kwargs)
+                               list_of_features_to_exclude,
+                               cols_to_output, **kwargs)
 
+        all_cols = get_cols(source_table, schema_madlib)
         if list_of_features.strip() == '*':
-            all_cols = get_cols(source_table, schema_madlib)
-            exclude_set = set(split_quoted_delimited_str(
-                list_of_features_to_exclude))
-            feature_list = [col for col in all_cols if col not in exclude_set]
-
+            exclude_set = set(split_quoted_delimited_str(list_of_features_to_exclude))
+            feature_list = [c for c in all_cols if c not in exclude_set]
         else:
             feature_list = split_quoted_delimited_str(list_of_features)
 
-        feature_cols = py_list_to_sql_string(
-            list(feature_list), "text", False)
-        filtered_list_of_features = ",".join(feature_list)
-
-        output_cols = ''
         if cols_to_output:
-            output_cols_list = [', '.join(get_cols(source_table, schema_madlib)) if col == '*' else col
-                                for col in split_quoted_delimited_str(cols_to_output)]
-            output_cols = ', '.join(output_cols_list) + ","
+            additional_cols = (all_cols if cols_to_output == '*' else
+                               split_quoted_delimited_str(cols_to_output))
+            additional_cols_str = ', '.join(additional_cols) + ","
+        else:
+            additional_cols_str = ''
 
+        feature_list_str = py_list_to_sql_string(feature_list, "TEXT[]", True)
         plpy.execute("""
-    		CREATE TABLE {output_table} AS
-    		select {output_cols}
-            array[{filtered_list_of_features}] as feature_vector
-    		from {source_table}
-    		""".format(**locals()))
+            CREATE TABLE {output_table} AS
+                SELECT {additional_cols_str}
+                       {feature_list_str} AS feature_vector
+                FROM {source_table}
+            """.format(**locals()))
 
-        plpy.execute("""
-            CREATE TABLE {output_table}_summary
-            (
-            source_table TEXT,
-            list_of_features TEXT,
-            list_of_features_to_exclude TEXT,
-            feature_cols TEXT[]
-            )""".format(output_table=output_table))
+        feature_cols = py_list_to_sql_string(
+            [quote_literal(f) for f in feature_list], "TEXT", True)
 
+        output_table_summary = add_postfix(output_table, "_summary")
+        # Dollar-quote the text to allow single-quotes without escaping
+        dq = "$__MADLIB_OUTER__$"
+        feature_exclude_str = ("NULL" if not list_of_features_to_exclude else
+                               list_of_features_to_exclude)
         plpy.execute("""
-            INSERT INTO {output_table}_summary
-            VALUES ('{source_table}','{list_of_features}',
-            '{list_of_features_to_exclude}',
-            (SELECT {feature_cols} as feature_names from {source_table} limit 1))
-            """.format( **locals()))
-        return
+            CREATE TABLE {output_table_summary} AS
+            SELECT
+                {dq}{source_table}{dq}::TEXT AS source_table,
+                {dq}{list_of_features}{dq}::TEXT AS list_of_features,
+                {dq}{feature_exclude_str}{dq}::TEXT AS list_of_features_to_exclude,
+                {feature_cols} AS feature_names
+            """.format(**locals()))

http://git-wip-us.apache.org/repos/asf/madlib/blob/950114cc/src/ports/postgres/modules/utilities/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in b/src/ports/postgres/modules/utilities/cols2vec.sql_in
index 035b992..a70a953 100644
--- a/src/ports/postgres/modules/utilities/cols2vec.sql_in
+++ b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -1,5 +1,5 @@
-/* ----------------------------------------------------------------------- *//**
- *
+/* ----------------------------------------------------------------------- */
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,17 +17,16 @@
  * specific language governing permissions and limitations
  * under the License.
  *
- *//* ----------------------------------------------------------------------- */
- 
-/* ----------------------------------------------------------------------- *//**
-*
-*file cols2vec.sql_in
-*
-*brief A set of utilities to ease basic table transformations (such as *aggregating multiple columns in an array)
-*
-*
-*//* ----------------------------------------------------------------------- */
- m4_include(`SQLCommon.m4')
+ *
+ * @file cols2vec.sql_in
+ * @brief A set of utilities to ease basic table transformations
+ *  (such as *aggregating multiple columns in an array)
+ * @date July 2018
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
 
 
 /**
@@ -44,52 +43,59 @@
 </div>
 
 @about
-Convert all feature columns in your table into an array in a single column.
-
-Given a table with varying number of columns, this function will create an output table that will contain the feature columns into an array. A summry table will be created and will the names
-of the features into array so that this process can be reversed using the function
-vec2cols from array_utilities in PDLTools.
+Convert feature columns in a table into an array in a single column.
 
-The columns that need NOT be included in the feature array need to be specified in the exclude_columns field.
+Given a table with a number of feature columns, this function will create an
+output table that contains the feature columns in an array. A summary
+table will also be created that contains the names of the features combined into
+array, so that this process can be reversed using the function vec2cols.
 
 @anchor cols2vec_usage
 @usage
 
 <pre class="syntax">
 cols2vec(
-    source_table ,
-    output_table ,
+    source_table,
+    output_table,
     list_of_features,
-    list_of_features_to_exclude ,
-    cols_to_output  
-) 
+    list_of_features_to_exclude,
+    cols_to_output
+)
 </pre>
 
 \b Arguments
 <dl class="arglist">
 <dt>source_table</dt>
-<dd>TEXT. Name of the table containing the source data.</tt>.
+<dd>TEXT. Name of the table containing the source data.</dd>.
 
 <dt>output_table</dt>
-<dd>TEXT. Name of the generated table containing the output.</tt>
+<dd>TEXT. Name of the generated table containing the output.</dd>
 
 <dt>list_of_features</dt>
-<dd>TEXT. Comma-separated string of column names or expressions to put into feature array. Can also be a '*' implying all columns are to be put into feature array (except for the ones included in the next argument that lists exclusions). Array columns in the source table are not supported in the 'list_of_features'. </tt>
+<dd>TEXT.
+Comma-separated string of column names or expressions to put into feature array.
+Can also be a '*' implying all columns are to be put into feature array (except
+for the ones included in the next argument that lists exclusions). Array columns
+in the source table are not supported in the 'list_of_features' parameter. </dd>
 
-<dt>list_of_features_to_exclude</dt>
-<dd>TEXT. Default NULL. Comma-separated string of column names to exclude from the feature array.  Use only when 'list_of_features' is '*'. </tt>
+<dt>list_of_features_to_exclude (optional)</dt>
+<dd>TEXT. Default NULL.
+Comma-separated string of column names to exclude from the feature array.  Use
+only when 'list_of_features' is '*'. </dd>
 
-<dt>cols_to_output</dt>
-<dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array.  To keep all columns from the source table, use '*'. </tt>
-
-</dd>
+<dt>cols_to_output (optional)</dt>
+<dd>TEXT. Default NULL.
+Comma-separated string of column names from the source table to keep in the
+output table, in addition to the feature array.  To keep all columns from the
+source table, use '*' for this parameter. </dd>
 
+</dl>
 
 @anchor cols2vec_example
-@examp
-
-<pre class="syntax">
+@par Examples
 
+-# Load sample data:
+<pre class="example">
 DROP TABLE IF EXISTS cols2vec;
 CREATE TABLE cols2vec (
     id              bigint,
@@ -99,39 +105,26 @@ CREATE TABLE cols2vec (
     feat3           float,
     other_col       float
 );
-
 INSERT INTO cols2vec  VALUES
 (1, 0, 1, 1, 0.5, 0.9),
 (2, 1, 0, 1, 0.3, 0.3),
 (3, 0, 0, 0, 0.1, 1.1),
 (4, 1, 1, 0, 0.9, 0.4);
+</pre>
 
-
-<pre>
-
-<pre class="syntax">
-drop table if exists cols2vec_result;
-drop table if exists cols2vec_result_summary;
-
-
-select cols2vec(
-      'cols2vec',                  -- input table
-      'cols2vec_result',		       -- Output table
-      'feat1,feat2,feat3',         -- Comma Seperated List of Features
-      'id',                        --  Features To Exclude
-      'id,label'	 			           -- Output columns to be included in
-      								                output table
+-# Create feature array:
+<pre class="example">
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT madlib.cols2vec(
+      'cols2vec',               -- input table
+      'cols2vec_result',        -- output table
+      'feat1,feat2,feat3',      -- list of features
+       NULL,                    -- features to exclude
+      'id, label           '    -- columns from input table to be included in output
 );
-
-select * from cols2vec_result;
+SELECT * FROM cols2vec_result ORDER BY id;
 </pre>
-
-
--# Expected output:
-
 <pre class="result">
-select * from cols2vec_result;
-
  id | label | feature_vector 
 ----+-------+----------------
   1 |     0 | {1,1,0.5}
@@ -139,20 +132,46 @@ select * from cols2vec_result;
   3 |     0 | {0,0,0.1}
   4 |     1 | {1,0,0.9}
 (4 rows)
+</pre>
+View summary table:
+<pre class="example">
+SELECT * FROM cols2vec_result_summary;
+</pre>
+<pre class="result">
+ source_table | list_of_features  | list_of_features_to_exclude |    feature_names    
+--------------+-------------------+-----------------------------+---------------------
+ cols2vec     | feat1,feat2,feat3 | None                        | {feat1,feat2,feat3}
+</pre>
 
-select * from cols2vec_result_summary;
-    feature_names    
----------------------
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
+-# The above result could be obtained in a similar way using the 'features_to_exclude' parameter:
+<pre class="example">
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT madlib.cols2vec(
+      'cols2vec',               -- input table
+      'cols2vec_result',        -- output table
+      '*',                      -- list of features
+      'id, label, other_col',   -- features to exclude
+      'id, label'               -- columns from input table to be included in output
+);
+SELECT * FROM cols2vec_result ORDER BY id;
+</pre>
+<pre class="result">
+ id | label | feature_vector 
+----+-------+----------------
+  1 |     0 | {1,1,0.5}
+  2 |     1 | {0,1,0.3}
+  3 |     0 | {0,0,0.1}
+  4 |     1 | {1,0,0.9}
 (4 rows)
-
 </pre>
-
-
-<pre class="syntax">
+View summary table:
+<pre class="example">
+SELECT * FROM cols2vec_result_summary;
+</pre>
+<pre class="result">
+ source_table | list_of_features | list_of_features_to_exclude |    feature_names    
+--------------+------------------+-----------------------------+---------------------
+ cols2vec     | *                | id, label, other_col        | {feat1,feat2,feat3}
 </pre>
 */
 
@@ -162,11 +181,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     output_table VARCHAR,
     list_of_features VARCHAR,
     list_of_features_to_exclude VARCHAR,
-    cols_to_output VARCHAR 
+    cols_to_output VARCHAR
 ) RETURNS void AS $$
-    PythonFunction(cols_vec, cols2vec, cols2vec)
+    PythonFunction(utilities, cols2vec, cols2vec)
 $$ LANGUAGE plpythonu VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');    
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
@@ -175,8 +194,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     list_of_features VARCHAR,
     list_of_features_to_exclude VARCHAR
 ) RETURNS void AS $$
-SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,$4,NULL)
-
+    SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, $4, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -185,7 +203,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     output_table VARCHAR,
     list_of_features VARCHAR
 ) RETURNS void AS $$
-SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,NULL,NULL)
-
+    SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

http://git-wip-us.apache.org/repos/asf/madlib/blob/950114cc/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
index 33fe024..fd22238 100644
--- a/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
+++ b/src/ports/postgres/modules/utilities/test/cols2vec.sql_in
@@ -1,4 +1,5 @@
-/* ----------------------------------------------------------------------- *//**
+/* ----------------------------------------------------------------------- */
+/**
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -17,10 +18,11 @@
  * specific language governing permissions and limitations
  * under the License.
  *
- *//* ----------------------------------------------------------------------- */
+ */
+/* ----------------------------------------------------------------------- */
 
-DROP TABLE IF EXISTS cols2vec;
-CREATE TABLE cols2vec (
+DROP TABLE IF EXISTS source_table;
+CREATE TABLE source_table (
     id              bigint,
     label           int,
     feat1           int,
@@ -28,8 +30,7 @@ CREATE TABLE cols2vec (
     feat3           float,
     other_col       float
 );
-
-INSERT INTO cols2vec  VALUES
+INSERT INTO source_table  VALUES
 (1, 0, 1, 1, 0.5, 0.9),
 (2, 1, 0, 1, 0.3, 0.3),
 (3, 0, 0, 0, 0.1, 1.1),
@@ -37,19 +38,19 @@ INSERT INTO cols2vec  VALUES
 
 
 DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
-
-select cols2vec( 'cols2vec',  'cols2vec_out','feat1,feat2,feat3', 'id', 'id,label' );
-
-select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec') from cols2vec_out where id = 1;
-
-
+SELECT cols2vec( 'source_table',  'cols2vec_out', 'feat1,feat2,feat3', 'id', 'id,label' );
+SELECT assert(feature_vector = '{1,1,0.5}',
+              'Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE id = 1;
 
 DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+SELECT cols2vec( 'source_table',  'cols2vec_out', '*', 'id,other_col', 'id,label' );
 
-select cols2vec( 'cols2vec',  'cols2vec_out','*', 'id,other_col', 'id,label' );
-
-
-select assert(feature_vector = '{1,1,0.5,0}','Incorrect results for cols2vec') from cols2vec_out where id = 1;
+SELECT * FROM cols2vec_out;
+SELECT assert(feature_vector = '{0,1,1,0.5}','Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE id = 1;
 
 
 DROP TABLE IF EXISTS cols2vec_spcl;
@@ -61,8 +62,6 @@ CREATE TABLE cols2vec_spcl (
     "fe'%*()at3"           	float,
     other_col       		float,
     "se''x" 				TEXT
-
-
 );
 
 INSERT INTO cols2vec_spcl  VALUES
@@ -75,15 +74,18 @@ INSERT INTO cols2vec_spcl  VALUES
 (7, 0, 1, 0, 0.7, 1.4,'MЖM');
 
 DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
+SELECT cols2vec( 'cols2vec_spcl',  'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"',
+                NULL, '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"' );
 
-select cols2vec( 'cols2vec_spcl',  'cols2vec_out','"fe''''at1",feat2,"fe''%*()at3"', Null, '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"' );
-
-select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec') from cols2vec_out where "i,Ж!#'d" = 1;
-
+select assert(feature_vector = '{1,1,0.5}','Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE "i,Ж!#'d" = 1;
 
 DROP TABLE IF EXISTS cols2vec_out, cols2vec_out_summary;
 
-select cols2vec( 'cols2vec_spcl',  'cols2vec_out','*', '"se''''x"', '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"' );
-select assert(feature_vector = '{0.5,1,0,0.9,1,1}','Incorrect results for cols2vec') from cols2vec_out where "i,Ж!#'d" = 1;
-
-
+SELECT cols2vec('cols2vec_spcl',  'cols2vec_out','*', '"se''''x"',
+                '"i,Ж!#''d","lab$$''''%*Ж!#''''()el"');
+SELECT assert(feature_vector = '{1,0,1,1,0.5,0.9}',
+              'Incorrect results for cols2vec')
+FROM cols2vec_out
+WHERE "i,Ж!#'d" = 1;

http://git-wip-us.apache.org/repos/asf/madlib/blob/950114cc/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index d571b40..87445a7 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -355,14 +355,14 @@ def py_list_to_sql_string(array, array_type=None, long_format=None):
     if not array:
         return "'{{ }}'::{0}".format(array_type)
     else:
-        quote_delimiter = "$__madlib__$"
+        quote_delimiter = "$__MADLIB_OUTER__$"
         # This is a quote delimiter that can be used in lieu of
         # single quotes and allows the use of single quotes in the
         # string without escaping
         array_str = "ARRAY[ {val} ]" if long_format else "{qd}{{ {val} }}{qd}"
-        return (array_str + "::{type}").format(
+        return (array_str + "::{array_type}").format(
             val=','.join(map(str, array)),
-            type=array_type,
+            array_type=array_type,
             qd=quote_delimiter)
 # ------------------------------------------------------------------------