You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@madlib.apache.org by ri...@apache.org on 2017/02/02 00:20:00 UTC

[1/2] incubator-madlib git commit: Encode categorical: Allow svec array output

Repository: incubator-madlib
Updated Branches:
  refs/heads/master 08294791f -> 916227178


Encode categorical: Allow svec array output

- Replace input parameter `array_output' with `output_type'
- Add cast to 'madlib.svec' if output_type = 'svec'

Closes #93


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/59a09eeb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/59a09eeb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/59a09eeb

Branch: refs/heads/master
Commit: 59a09eeb4e0bbf4d63238a3e87cea90d020d24c2
Parents: 0829479
Author: Satoshi Nagayasu <sn...@uptime.jp>
Authored: Thu Jan 26 00:54:59 2017 +0900
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 16:16:39 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 23 ++++++++++++--------
 .../modules/utilities/encode_categorical.sql_in | 10 ++++-----
 .../utilities/test/encode_categorical.sql_in    |  8 +++----
 3 files changed, 23 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 5d7ead4..26361a6 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -60,7 +60,7 @@ class CategoricalEncoder(object):
                  top=None,
                  value_to_drop=None,
                  encode_null=False,
-                 array_output=False,
+                 output_type='column',
                  output_dictionary=False,
                  distributed_by=None,
                  **kwargs):
@@ -74,7 +74,7 @@ class CategoricalEncoder(object):
         self.top = top
         self.value_to_drop = value_to_drop
         self.encode_null = encode_null
-        self.array_output = array_output
+        self.output_type = output_type
         self.output_dictionary = output_dictionary
         self.distributed_by = distributed_by
 
@@ -121,10 +121,15 @@ class CategoricalEncoder(object):
                                     if (c not in self._output_cols and
                                         unquote_ident(c) not in self._output_cols)])
 
-        if self.array_output:
+        if self.output_type == 'array':
             categorical_col_str = ("ARRAY[{0}] AS {1}".
                                    format(categorical_col_str,
                                           self._array_out_name))
+        elif self.output_type == 'svec':
+            categorical_col_str = ("ARRAY[{0}]::float8[]::{1}.svec AS {2}".
+                                   format(categorical_col_str,
+                                          self.schema_madlib,
+                                          self._array_out_name))
         out_sql = """
             CREATE TABLE {out} AS (
                 SELECT
@@ -154,7 +159,7 @@ class CategoricalEncoder(object):
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
         # flag to build a dictionary table
-        self._output_dictionary = True if self.array_output else self.output_dictionary
+        self._output_dictionary = True if self.output_type in ['array', 'svec'] else self.output_dictionary
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -295,7 +300,7 @@ class CategoricalEncoder(object):
                 value_str = "= '{v}'".format(v=str(v))
                 v_type = str
 
-            if not self.array_output:
+            if self.output_type not in ('array', 'svec'):
                 # array_output = True implies all the case outputs will be wrapped
                 # as an array, hence not requiring an alias for each case
                 if not self._output_dictionary:
@@ -350,7 +355,7 @@ class CategoricalEncoder(object):
             values = col_to_values[col]
             local_seq = count(1)
             col_no_quotes = strip_end_quotes(col)
-            if self.array_output:
+            if self.output_type != 'column':
                 encoded_col_name = "__encoded_variables__"
                 seq = global_seq
             else:
@@ -549,7 +554,7 @@ def encode_categorical_variables(
         top=None,
         value_to_drop=None,
         encode_null=False,
-        array_output=None,
+        output_type='column',
         output_dictionary=False,
         distributed_by=None,
         **kwargs):
@@ -573,7 +578,7 @@ def encode_categorical_variables(
         encoder = CategoricalEncoder(schema_madlib, source_table, output_table,
                                      categorical_cols, categorical_cols_to_exclude,
                                      row_id, top, value_to_drop, encode_null,
-                                     array_output, output_dictionary,
+                                     output_type, output_dictionary,
                                      distributed_by)
         encoder.build_output_table()
     return None
@@ -639,7 +644,7 @@ SELECT {madlib}.encode_categorical_variables (
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
-        array_output,                   -- (Optional) Get all encoded variables in an array
+        output_type,                    -- (Optional) Get all encoded variables in an array
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index f591590..c4151b2 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -79,7 +79,7 @@ encode_categorical_variables (
         top,                            -- Optional
         value_to_drop,                  -- Optional
         encode_null,                    -- Optional
-        array_output,                   -- Optional
+        output_type,                    -- Optional
         output_dictionary,              -- Optional
         distributed_by                  -- Optional
         )
@@ -660,7 +660,7 @@ SELECT madlib.encode_categorical_variables (
  * @param top Parameter to include only top values of a categorical variable
  * @param value_to_drop Parameter to set reference column in dummy coding
  * @param encode_null Boolean to determine the behavior for rows with NULL value
- * @param array_output Boolean to determine if output should be in an array or columns
+ * @param output_type Parameter to set output data type: 'column', 'array' or 'svec'
  * @param output_dictionary Boolean to simplify column naming and with a separate
  *                              mapping table to actual values
  * @param distributed_by Comma-separated list of column names to use for distribution of output
@@ -682,7 +682,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN,
+    output_type                     VARCHAR,
     output_dictionary               BOOLEAN,
     distributed_by                  VARCHAR
 ) RETURNS VOID AS $$
@@ -701,7 +701,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN,
+    output_type                     VARCHAR,
     output_dictionary               BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunction(utilities, encode_categorical, encode_categorical_variables)
@@ -720,7 +720,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables(
     top                             VARCHAR,
     value_to_drop                   VARCHAR,
     encode_null                     BOOLEAN,
-    array_output                    BOOLEAN
+    output_type                     VARCHAR
 ) RETURNS VOID AS $$
     PythonFunction(utilities, encode_categorical, encode_categorical_variables)
 $$ LANGUAGE plpythonu

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/59a09eeb/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 0c451a4..79cfba9 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -70,7 +70,7 @@ select * from abalone_out2;
 select madlib.encode_categorical_variables('abalone', 'abalone_out3',
                                            'sex, "Class"', 'class',
                                            'id, sex, "Class"', '2', 'sex=M, Class=1',
-                                           true, false, false
+                                           true, 'column', false
                                            );
 select * from abalone_out3;
 
@@ -78,7 +78,7 @@ select * from abalone_out3;
 select madlib.encode_categorical_variables('abalone', 'abalone_out4',
                                            '*', '"Class"',
                                            'id', '2', 'M',
-                                           true, false, false
+                                           true, 'column', false
                                            );
 select * from abalone_out4;
 
@@ -87,7 +87,7 @@ select * from abalone_out4;
 select madlib.encode_categorical_variables('abalone', 'abalone_out5',
                                            'sex, "Class"', '',
                                            'id', '0.5', 'M',
-                                           true, true, false
+                                           true, 'array', false
                                            );
 select * from abalone_out5;
 select * from abalone_out5_dictionary order by index;
@@ -96,7 +96,7 @@ select * from abalone_out5_dictionary order by index;
 select madlib.encode_categorical_variables('abalone', 'abalone_out6',
                                            'sex, "Class"', '',
                                            'id', '3', 'class=1',
-                                           true, false, true
+                                           true, 'svec', true
                                            );
 select * from abalone_out6;
 select * from abalone_out6_dictionary order by variable, index;

[2/2] incubator-madlib git commit: Encode categorical: Update doc and code comments

Posted by ri...@apache.org.

Encode categorical: Update doc and code comments


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/91622717
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/91622717
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/91622717

Branch: refs/heads/master
Commit: 916227178c60279861dfce2412df57e37a06651a
Parents: 59a09ee
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Feb 1 16:12:46 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 16:18:35 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 15 ++++++++----
 .../modules/utilities/encode_categorical.sql_in | 24 +++++++++++++-------
 2 files changed, 26 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 26361a6..a92c559 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -159,7 +159,7 @@ class CategoricalEncoder(object):
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
         # flag to build a dictionary table
-        self._output_dictionary = True if self.output_type in ['array', 'svec'] else self.output_dictionary
+        self._output_dictionary = True if self.output_type in ('array', 'svec') else self.output_dictionary
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -228,6 +228,9 @@ class CategoricalEncoder(object):
             _assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._row_id_cols, self.source_table))
+        _assert(self.output_type in ('column', 'array', 'svec'),
+                "Encoding categorical: Output type should be one of {0}".
+                format(('column', 'array', 'svec')))
     # ------------------------------------------------------------------------------
 
     def _is_col_name_long(self, col_to_values):
@@ -301,8 +304,6 @@ class CategoricalEncoder(object):
                 v_type = str
 
             if self.output_type not in ('array', 'svec'):
-                # array_output = True implies all the case outputs will be wrapped
-                # as an array, hence not requiring an alias for each case
                 if not self._output_dictionary:
                     value_names = {None: 'NULL',
                                    list: self._name_others_col,
@@ -311,6 +312,8 @@ class CategoricalEncoder(object):
                 else:
                     alias = 'AS "{0}_{1}"'.format(col_no_quotes, seq)
             else:
+                # if output_type is array-like then each case does not
+                # require an alias
                 alias = ""
             return ("(CASE WHEN ({col} {value_str}) "
                     "THEN 1 ELSE 0 END)::INTEGER {alias}".
@@ -569,7 +572,8 @@ def encode_categorical_variables(
         @param top: str, Parameter to include only top values of a categorical variable
         @param value_to_drop: str, Parameter to set reference column in dummy coding
         @param encode_null: bool, If True, NULL is treated as a categorical value
-        @param array_output: bool, Parameter to determine if output should be in an array or columns
+        @param output_type: str, Parameter to determine if output should be an array, svec or individual columns
+                                 Can take values ('column', 'array', 'svec')
         @param output_dictionary: bool, If True columns names are simplified and
                     a separate mapping table is created to understand the names
         @param distributed_by: str, Comma-separated list of column names to use for distribution of output
@@ -644,7 +648,8 @@ SELECT {madlib}.encode_categorical_variables (
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
-        output_type,                    -- (Optional) Get all encoded variables in an array
+        output_type,                    -- (Optional) Get encoded variables in individual columns
+                                        --    or as an array (Can be 'column', 'array', or 'svec')
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index c4151b2..a14337c 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -159,16 +159,24 @@ encode_categorical_variables (
     all encoded values for that variable will be set to 0.
     </dd>
 
-    <dt>array_output (optional)</dt>
-    <dd>BOOLEAN. default: FALSE.  This parameter controls the output format
-    of the indicator variables. If FALSE, a column is created for each indicator
+    <dt>output_type (optional)</dt>
+    <dd>VARCHAR. default: 'column'.  This parameter controls the output format
+    of the indicator variables. If 'column', a column is created for each indicator
     variable. PostgreSQL limits the number of columns in a table.
     If the total number of indicator columns exceeds the limit, then make this
-    parameter TRUE to combine the indicator columns
-    into an array. The order of the array is the same as specified in 'categorical_cols'.
-    A dictionary will be created when 'array_output' is TRUE to define an index into
-    the array.  The dictionary table will be given the name of the 'output_table'
-    appended by '_dictionary'.
+    parameter either 'array' to combine the indicator columns into an array or
+    'svec' to cast the array output to <em>'madlib.svec'</em> type.
+
+    Since the array output for any single tuple would be sparse
+    (only one non-zero entry for each categorical column), the 'svec' output would
+    be most efficient for storage. The 'array' output is useful if the array is
+    used for post-processing, including concatenating with other non-categorical
+    features.
+
+    The order of the array is the same as specified in 'categorical_cols'.
+    A dictionary will be created when 'output_type' is 'array' or 'svec' to
+    define an index into the array.  The dictionary table will be given the name
+    of the 'output_table' appended by '_dictionary'.
     </dd>
 
     <dt>output_dictionary (optional)</dt>