You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/02/02 00:20:01 UTC
[2/2] incubator-madlib git commit: Encode categorical: Update doc and code comments

Encode categorical: Update doc and code comments


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/91622717
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/91622717
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/91622717

Branch: refs/heads/master
Commit: 916227178c60279861dfce2412df57e37a06651a
Parents: 59a09ee
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Feb 1 16:12:46 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 1 16:18:35 2017 -0800

----------------------------------------------------------------------
 .../modules/utilities/encode_categorical.py_in  | 15 ++++++++----
 .../modules/utilities/encode_categorical.sql_in | 24 +++++++++++++-------
 2 files changed, 26 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 26361a6..a92c559 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -159,7 +159,7 @@ class CategoricalEncoder(object):
         self._row_id_cols = split_quoted_delimited_str(self.row_id)
 
         # flag to build a dictionary table
-        self._output_dictionary = True if self.output_type in ['array', 'svec'] else self.output_dictionary
+        self._output_dictionary = True if self.output_type in ('array', 'svec') else self.output_dictionary
 
         # how to distribute the output table (for distributed platforms)
         if not is_platform_pg():
@@ -228,6 +228,9 @@ class CategoricalEncoder(object):
             _assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
                     "Encoding categorical: Not all columns from ({0}) present in source table ({1})"
                     .format(self._row_id_cols, self.source_table))
+        _assert(self.output_type in ('column', 'array', 'svec'),
+                "Encoding categorical: Output type should be one of {0}".
+                format(('column', 'array', 'svec')))
     # ------------------------------------------------------------------------------
 
     def _is_col_name_long(self, col_to_values):
@@ -301,8 +304,6 @@ class CategoricalEncoder(object):
                 v_type = str
 
             if self.output_type not in ('array', 'svec'):
-                # array_output = True implies all the case outputs will be wrapped
-                # as an array, hence not requiring an alias for each case
                 if not self._output_dictionary:
                     value_names = {None: 'NULL',
                                    list: self._name_others_col,
@@ -311,6 +312,8 @@ class CategoricalEncoder(object):
                 else:
                     alias = 'AS "{0}_{1}"'.format(col_no_quotes, seq)
             else:
+                # if output_type is array-like then each case does not
+                # require an alias
                 alias = ""
             return ("(CASE WHEN ({col} {value_str}) "
                     "THEN 1 ELSE 0 END)::INTEGER {alias}".
@@ -569,7 +572,8 @@ def encode_categorical_variables(
         @param top: str, Parameter to include only top values of a categorical variable
         @param value_to_drop: str, Parameter to set reference column in dummy coding
         @param encode_null: bool, If True, NULL is treated as a categorical value
-        @param array_output: bool, Parameter to determine if output should be in an array or columns
+        @param output_type: str, Parameter to determine if output should be an array, svec or individual columns
+                                 Can take values ('column', 'array', 'svec')
         @param output_dictionary: bool, If True columns names are simplified and
                     a separate mapping table is created to understand the names
         @param distributed_by: str, Comma-separated list of column names to use for distribution of output
@@ -644,7 +648,8 @@ SELECT {madlib}.encode_categorical_variables (
         value_to_drop,                  -- (Optional) Reference value to drop for each column
         encode_null,                    -- (Optional) Whether NULL should be treated as one of the
                                         --  values of the categorical variable.
-        output_type,                    -- (Optional) Get all encoded variables in an array
+        output_type,                    -- (Optional) Get encoded variables in individual columns
+                                        --    or as an array (Can be 'column', 'array', or 'svec')
         output_dictionary,              -- (Optional) Simplify output column naming and provide
                                         --  a mapping between simple names and meaning
         distributed_by                  -- (Optional) Columns to use for the distribution policy of

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/91622717/src/ports/postgres/modules/utilities/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
index c4151b2..a14337c 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.sql_in
@@ -159,16 +159,24 @@ encode_categorical_variables (
     all encoded values for that variable will be set to 0.
     </dd>
 
-    <dt>array_output (optional)</dt>
-    <dd>BOOLEAN. default: FALSE.  This parameter controls the output format
-    of the indicator variables. If FALSE, a column is created for each indicator
+    <dt>output_type (optional)</dt>
+    <dd>VARCHAR. default: 'column'.  This parameter controls the output format
+    of the indicator variables. If 'column', a column is created for each indicator
     variable. PostgreSQL limits the number of columns in a table.
     If the total number of indicator columns exceeds the limit, then make this
-    parameter TRUE to combine the indicator columns
-    into an array. The order of the array is the same as specified in 'categorical_cols'.
-    A dictionary will be created when 'array_output' is TRUE to define an index into
-    the array.  The dictionary table will be given the name of the 'output_table'
-    appended by '_dictionary'.
+    parameter either 'array' to combine the indicator columns into an array or
+    'svec' to cast the array output to <em>'madlib.svec'</em> type.
+
+    Since the array output for any single tuple would be sparse
+    (at most one non-zero entry for each categorical column), the 'svec' output would
+    be most efficient for storage. The 'array' output is useful if the array is
+    used for post-processing, including concatenating with other non-categorical
+    features.
+
+    The order of the array is the same as specified in 'categorical_cols'.
+    A dictionary will be created when 'output_type' is 'array' or 'svec' to
+    define an index into the array.  The dictionary table will be named after
+    the 'output_table' with '_dictionary' appended.
     </dd>
 
     <dt>output_dictionary (optional)</dt>