You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ji...@apache.org on 2018/07/02 21:54:14 UTC
[3/3] madlib git commit: Encode categorical variables: handling
special characters
Encode categorical variables: handling special characters
JIRA: MADLIB-1238
JIRA: MADLIB-1243
This commit deals with special characters in column name and column
values. Also adds install check test cases to cover these scenarios.
Closes #281
Co-Authored-by: Jingyi Mei <jm...@pivotal.io>
Co-Authored-by: Arvind Sridhar <as...@pivotal.io>
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/610ea0b6
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/610ea0b6
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/610ea0b6
Branch: refs/heads/master
Commit: 610ea0b645d0e0baeb7b4cb30ceac942c63e1c87
Parents: fc8498e
Author: Arvind Sridhar <as...@pivotal.io>
Authored: Wed May 23 17:02:43 2018 -0700
Committer: Jingyi Mei <jm...@pivotal.io>
Committed: Mon Jul 2 14:46:34 2018 -0700
----------------------------------------------------------------------
src/ports/postgres/modules/convex/mlp.sql_in | 3 +-
.../modules/utilities/encode_categorical.py_in | 8 ++---
.../utilities/test/encode_categorical.sql_in | 31 ++++++++++++++++++++
3 files changed, 37 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/convex/mlp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp.sql_in b/src/ports/postgres/modules/convex/mlp.sql_in
index c4f8271..13ae4a0 100644
--- a/src/ports/postgres/modules/convex/mlp.sql_in
+++ b/src/ports/postgres/modules/convex/mlp.sql_in
@@ -104,7 +104,8 @@ mlp_classification(
</dd>
<dt>independent_varname</dt>
- <dd>TEXT. Expression list to evaluate for the independent variables.
+ <dd>TEXT. Expression list to evaluate for the independent variables. It should
+ be a numeric array expression.
If you are using mini-batching, set this parameter to 'independent_varname'
which is the hardcoded name of the column from the mini-batch preprocessor
containing the packed independent variables.
http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/utilities/encode_categorical.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/encode_categorical.py_in b/src/ports/postgres/modules/utilities/encode_categorical.py_in
index 0d7eb91..cd08012 100644
--- a/src/ports/postgres/modules/utilities/encode_categorical.py_in
+++ b/src/ports/postgres/modules/utilities/encode_categorical.py_in
@@ -297,7 +297,7 @@ class CategoricalEncoder(object):
# all values collected in a list are to be treated as a single
# categorical factor
if v:
- non_null_v_str = ','.join(["'%s'" % (i) for i in v if i is not None])
+ non_null_v_str = ','.join(["$__madlib__$%s$__madlib__$" % (i) for i in v if i is not None])
if non_null_v_str:
value_str = "IN ({0})".format(non_null_v_str)
if None in v:
@@ -314,7 +314,7 @@ class CategoricalEncoder(object):
cast_str = ''
else:
# assume v is a string if not list/tuple and not None
- value_str = "= '{v}'".format(v=str(v))
+ value_str = "= $__madlib__${v}$__madlib__$".format(v=str(v))
v_type = str
cast_str = '::TEXT'
@@ -380,7 +380,7 @@ class CategoricalEncoder(object):
else:
encoded_col_name = '"{col_no_quotes}_{seq}"'
seq = local_seq
- insert_template = "('%s', {seq}, '{col}', '{value_str}'::TEXT)" % (encoded_col_name)
+ insert_template = "('%s', {seq}, '{col}', $__madlib__${value_str}$__madlib__$::TEXT)" % (encoded_col_name)
insert_values = [insert_template.
format(col=col,
col_no_quotes=col_no_quotes,
@@ -461,7 +461,7 @@ class CategoricalEncoder(object):
# get value distribution for each column independently
top_val_sql_list.append("""
SELECT
- '{col}' as col_name,
+ $__madlib__${col}$__madlib__$ as col_name,
array_agg(f order by c desc) as value,
array_agg(c order by c desc) as freq
FROM (
http://git-wip-us.apache.org/repos/asf/madlib/blob/610ea0b6/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
index 31c27aa..7dc6169 100644
--- a/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
+++ b/src/ports/postgres/modules/utilities/test/encode_categorical.sql_in
@@ -100,3 +100,34 @@ select encode_categorical_variables('abalone', 'abalone_out6',
);
select * from abalone_out6;
select * from abalone_out6_dictionary order by variable, index;
+
+-- Test special charaters and unicode
+DROP TABLE IF EXISTS abalone_special_char;
+CREATE TABLE abalone_special_char (
+ id serial,
+ "se$$''x" character varying,
+ "len$$'%*()gth" double precision,
+ diameter double precision,
+ height double precision,
+ "ClaЖss" integer
+);
+COPY abalone_special_char ("se$$''x", "len$$'%*()gth", diameter, height, "ClaЖss") FROM stdin WITH DELIMITER '|' NULL as '@';
+F"F|0.475|0.37|0.125|2
+F'F|0.55|0.44|0.15|0
+F$$,'}][{F|0.565|0.44|0.155|2
+MЖM|0.44|0.365|0.125|0
+M@[}(:*;M|0.475|0.37|0.125|2
+M,M|0.47|0.355|0.100|1
+'F'F'|0.55|0.44|0.15|0
+\.
+
+select encode_categorical_variables('abalone_special_char', 'abalone_special_char_out1', '"se$$''''x", "len$$''%*()gth"');
+select * from abalone_special_char_out1;
+
+select encode_categorical_variables('abalone_special_char',
+ 'abalone_special_char_out2',
+ '"se$$''''x", "ClaЖss"', '',
+ 'id', '3', 'claЖss=1',
+ true, 'svec', true);
+select * from abalone_special_char_out2;
+select * from abalone_special_char_out2_dictionary order by variable, index;