You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ji...@apache.org on 2018/07/02 21:54:14 UTC

[3/3] madlib git commit: Encode categorical variables: handling special characters

Encode categorical variables: handling special characters

JIRA: MADLIB-1238
JIRA: MADLIB-1243

This commit deals with special characters in column name and column
values. Also adds install check test cases to cover these scenarios.

Closes #281

Co-Authored-by: Jingyi Mei <jm...@pivotal.io>
Co-Authored-by: Arvind Sridhar <as...@pivotal.io>


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/610ea0b6
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/610ea0b6
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/610ea0b6

Branch: refs/heads/master
Commit: 610ea0b645d0e0baeb7b4cb30ceac942c63e1c87
Parents: fc8498e
Author: Arvind Sridhar <as...@pivotal.io>
Authored: Wed May 23 17:02:43 2018 -0700
Committer: Jingyi Mei <jm...@pivotal.io>
Committed: Mon Jul 2 14:46:34 2018 -0700

----------------------------------------------------------------------
 src/ports/postgres/modules/convex/mlp.sql_in    |  3 +-
 .../modules/utilities/encode_categorical.py_in  |  8 ++---
 .../utilities/test/encode_categorical.sql_in    | 31 ++++++++++++++++++++
 3 files changed, 37 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index c4f8271..13ae4a0 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -104,7 +104,8 @@ mlp_classification(
   </dd>
 
   <dt>independent_varname</dt>
-  <dd>TEXT. Expression list to evaluate for the independent variables.
+  <dd>TEXT. Expression list to evaluate for the independent variables. It should
+  be a numeric array expression.
   If you are using mini-batching, set this parameter to 'independent_varname'
   which is the hardcoded name of the column from the mini-batch preprocessor
   containing the packed independent variables.

http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 0d7eb91..cd08012 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -297,7 +297,7 @@ class CategoricalEncoder(object):
                 # all values collected in a list are to be treated as a single
                 # categorical factor
                 if v:
-                    non_null_v_str = ','.join(["'%s'" % (i) for i in v if i is not None])
+                    non_null_v_str = ','.join(["$__madlib__$%s$__madlib__$" % (i) for i in v if i is not None])
                     if non_null_v_str:
                         value_str = "IN ({0})".format(non_null_v_str)
                         if None in v:
@@ -314,7 +314,7 @@ class CategoricalEncoder(object):
                 cast_str = ''
             else:
                 # assume v is a string if not list/tuple and not None
-                value_str = "= '{v}'".format(v=str(v))
+                value_str = "= $__madlib__${v}$__madlib__$".format(v=str(v))
                 v_type = str
                 cast_str = '::TEXT'
 
@@ -380,7 +380,7 @@ class CategoricalEncoder(object):
             else:
                 encoded_col_name = '"{col_no_quotes}_{seq}"'
                 seq = local_seq
-            insert_template = "('%s', {seq}, '{col}', '{value_str}'::TEXT)" % (encoded_col_name)
+            insert_template = "('%s', {seq}, '{col}', $__madlib__${value_str}$__madlib__$::TEXT)" % (encoded_col_name)
             insert_values = [insert_template.
                              format(col=col,
                                     col_no_quotes=col_no_quotes,
@@ -461,7 +461,7 @@ class CategoricalEncoder(object):
             # get value distribution for each column independently
             top_val_sql_list.append("""
                 SELECT
-                    '{col}' as col_name,
+                    $__madlib__${col}$__madlib__$ as col_name,
                     array_agg(f order by c desc) as value,
                     array_agg(c order by c desc) as freq
                 FROM (

http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 31c27aa..7dc6169 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -100,3 +100,34 @@ select encode_categorical_variables('abalone', 'abalone_out6',
                                            );
 select * from abalone_out6;
 select * from abalone_out6_dictionary order by variable, index;
+
+-- Test special charaters and unicode
+DROP TABLE IF EXISTS abalone_special_char;
+CREATE TABLE abalone_special_char (
+    id serial,
+    "se$$''x" character varying,
+    "len$$'%*()gth" double precision,
+    diameter double precision,
+    height double precision,
+    "ClaЖss" integer
+);
+COPY abalone_special_char ("se$$''x", "len$$'%*()gth", diameter, height, "ClaЖss") FROM stdin WITH DELIMITER '|' NULL as '@';
+F"F|0.475|0.37|0.125|2
+F'F|0.55|0.44|0.15|0
+F$$,'}][{F|0.565|0.44|0.155|2
+MЖM|0.44|0.365|0.125|0
+M@[}(:*;M|0.475|0.37|0.125|2
+M,M|0.47|0.355|0.100|1
+'F'F'|0.55|0.44|0.15|0
+\.
+
+select encode_categorical_variables('abalone_special_char', 'abalone_special_char_out1', '"se$$''''x", "len$$''%*()gth"');
+select * from abalone_special_char_out1;
+
+select encode_categorical_variables('abalone_special_char',
+                                    'abalone_special_char_out2',
+                                    '"se$$''''x", "ClaЖss"', '',
+                                    'id', '3', 'claЖss=1',
+                                     true, 'svec', true);
+select * from abalone_special_char_out2;
+select * from abalone_special_char_out2_dictionary order by variable, index;