You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ri...@apache.org on 2017/02/09 01:45:57 UTC
incubator-madlib git commit: Encode categorical: Allow NULL value for output_type
Repository: incubator-madlib
Updated Branches:
refs/heads/master 8e7c6ebfe -> fcf21a3bd
Encode categorical: Allow NULL value for output_type
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/fcf21a3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/fcf21a3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/fcf21a3b
Branch: refs/heads/master
Commit: fcf21a3bd272ada8f3977fbf23a2df66c2fd6f57
Parents: 8e7c6eb
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed Feb 8 17:45:47 2017 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Feb 8 17:45:53 2017 -0800
----------------------------------------------------------------------
.../modules/utilities/encode_categorical.py_in | 24 +++++++++++++++-----
1 file changed, 18 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/fcf21a3b/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index a92c559..54b4add 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -60,7 +60,7 @@ class CategoricalEncoder(object):
top=None,
value_to_drop=None,
encode_null=False,
- output_type='column',
+ output_type=None,
output_dictionary=False,
distributed_by=None,
**kwargs):
@@ -74,7 +74,8 @@ class CategoricalEncoder(object):
self.top = top
self.value_to_drop = value_to_drop
self.encode_null = encode_null
- self.output_type = output_type
+
+ self.output_type = 'column' if not output_type else output_type.lower()
self.output_dictionary = output_dictionary
self.distributed_by = distributed_by
@@ -158,8 +159,22 @@ class CategoricalEncoder(object):
# columns that determine the index for output table
self._row_id_cols = split_quoted_delimited_str(self.row_id)
+ # output type for specific supported types
+ all_output_types = sorted(['array', 'column', 'svec'])
+ try:
+ # allow user to specify a prefix substring of
+ # supported output types. This works because the supported
+ # output types have unique prefixes.
+ self.output_type = next(s for s in all_output_types
+ if s.startswith(self.output_type))
+ except StopIteration:
+ # next() raises StopIteration if no element is found
+ plpy.error("Encoding categorical: Output type should be one of {0}".
+ format(','.join(all_output_types)))
+
# flag to build a dictionary table
- self._output_dictionary = True if self.output_type in ('array', 'svec') else self.output_dictionary
+ self._output_dictionary = (True if self.output_type in ('array', 'svec')
+ else self.output_dictionary)
# how to distribute the output table (for distributed platforms)
if not is_platform_pg():
@@ -228,9 +243,6 @@ class CategoricalEncoder(object):
_assert(is_var_valid(self.source_table, ','.join(self._row_id_cols)),
"Encoding categorical: Not all columns from ({0}) present in source table ({1})"
.format(self._row_id_cols, self.source_table))
- _assert(self.output_type in ('column', 'array', 'svec'),
- "Encoding categorical: Output type should be one of {0}".
- format(('column', 'array', 'svec')))
# ------------------------------------------------------------------------------
def _is_col_name_long(self, col_to_values):