You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by ok...@apache.org on 2017/08/29 20:42:22 UTC
[45/50] [abbrv] incubator-madlib git commit: DT: Correctly encode
unseen categorical features
DT: Correctly encode unseen categorical features
Changes applied in commit a2f4740 added an option to treat NULL values
as a new category. This was implemented by changing the encoding process
of categorical features to add a new value at the end of the list of
values. The intention with the commit was to treat new unseen, non-null
values equivalent to NULL. The process, however, still encoded the
unseen categorical value as -1, which is interpreted as NULL in
underlying functions. This commit updates this process to correctly use
the last index as the encoding for the unseen/NULL value.
Closes #171
Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/4264fa9d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/4264fa9d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/4264fa9d
Branch: refs/heads/latest_release
Commit: 4264fa9d3bbfdb616364339c127373d7f0568e30
Parents: f1aa9af
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Aug 18 16:59:44 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Aug 18 16:59:44 2017 -0700
----------------------------------------------------------------------
.../recursive_partitioning/feature_encoding.cpp | 22 ++++--
.../recursive_partitioning/decision_tree.py_in | 7 +-
.../recursive_partitioning/decision_tree.sql_in | 73 ++++++++++----------
.../test/decision_tree.sql_in | 2 +-
4 files changed, 59 insertions(+), 45 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/modules/recursive_partitioning/feature_encoding.cpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/feature_encoding.cpp b/src/modules/recursive_partitioning/feature_encoding.cpp
index 3b0a452..a5f131b 100644
--- a/src/modules/recursive_partitioning/feature_encoding.cpp
+++ b/src/modules/recursive_partitioning/feature_encoding.cpp
@@ -184,21 +184,31 @@ map_catlevel_to_int::run(AnyType &args){
ArrayHandle<text*> cat_values = args[0].getAs<ArrayHandle<text*> >();
ArrayHandle<text*> cat_levels = args[1].getAs<ArrayHandle<text*> >();
ArrayHandle<int> n_levels = args[2].getAs<ArrayHandle<int> >();
+ bool null_as_category = args[3].getAs<bool>();
MutableArrayHandle<int> cat_int = allocateArray<int>(n_levels.size());
int pos = 0;
for (size_t i = 0; i < n_levels.size(); i++) {
// linear search to find a match
- int match = -1; // if cat_values contains any not present in cat_levels,
- // then the mapped integer is -1. If cat_values contains
- // a known cat_level, then the mapped integer is
- // the index of that value in cat_levels
- for (int j = 0; j < n_levels[i]; j++)
+
+ // if cat_values contains any not present in cat_levels, then the mapped
+ // integer is -1. If cat_values contains a known cat_level, then the
+ // mapped integer is the index of that value in cat_levels.
+ int match = -1;
+ for (int j = 0; j < n_levels[i]; j++){
if (cmp_text(cat_values[i], cat_levels[pos + j])) {
match = j;
break;
}
- cat_int[i] = match;
+ }
+
+ // If null_as_category is True, then match is set to the last index
+ // instead of -1 since the last index is expected to represent NULL.
+ if (match == -1 and null_as_category){
+ cat_int[i] = n_levels[i] - 1;
+ } else {
+ cat_int[i] = match;
+ }
pos += static_cast<int>(n_levels[i]);
}
return cat_int;
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 35fc3f9..ce56ab0 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -1179,8 +1179,11 @@ def get_feature_str(schema_madlib, boolean_cats,
"(coalesce({0}::text, '{1}'))::text".format(col, null_val))
cat_features_str = ("{0}._map_catlevel_to_int(array[" +
- ", ".join(cat_features_cast) + "], {1}, {2})"
- ).format(schema_madlib, levels_str, n_levels_str)
+ ", ".join(cat_features_cast) + "], {1}, {2}, {3})"
+ ).format(schema_madlib,
+ levels_str,
+ n_levels_str,
+ null_proxy is not None)
else:
cat_features_str = "NULL"
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 8290dcf..3632f2e 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -912,20 +912,20 @@ Result:
(32 rows)
</pre>
-<h4>NULL handling example</h4>
+<h4>NULL Handling Example</h4>
-- Create toy example to illustrate null-as-category handling
+-# Create toy example to illustrate null-as-category handling
<pre class='example'>
-drop table null_handling_example;
-create table null_handling_example (
-id integer,
-country text,
-city text,
-weather text,
-response text
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+ id integer,
+ country text,
+ city text,
+ weather text,
+ response text
);
-insert into null_handling_example values
+INSERT INTO null_handling_example VALUES
(1,null,null,null,'a'),
(2,'US',null,null,'b'),
(3,'US','NY',null,'c'),
@@ -933,22 +933,22 @@ insert into null_handling_example values
DROP TABLE IF EXISTS train_output, train_output_summary;
SELECT madlib.tree_train('null_handling_example', -- source table
- 'train_output', -- output model table
- 'id', -- id column
- 'response', -- dependent variable
- 'country, city, weather', -- features
- NULL,
- 'gini', -- split criterion
- NULL::text, -- no grouping
- NULL::text, -- no weights
- 4, -- max depth
- 1, -- min split
- 1, -- number of bins per continuous variable
- 10, -- number of splits
- NULL, -- pruning parameters
- 'null_as_category=true' -- null handling
+ 'train_output', -- output model table
+ 'id', -- id column
+ 'response', -- dependent variable
+ 'country, weather, city', -- features
+ NULL, -- features to exclude
+ 'gini', -- split criterion
+ NULL::text, -- no grouping
+ NULL::text, -- no weights
+ 4, -- max depth
+ 1, -- min split
+ 1, -- number of bins per continuous variable
+ 10, -- number of splits
+ NULL, -- pruning parameters
+ 'null_as_category=true' -- null handling
);
-select cat_levels_in_text, cat_n_levels from train_output;
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
</pre>
<pre class='result'>
cat_levels_in_text | cat_n_levels
@@ -956,17 +956,17 @@ select cat_levels_in_text, cat_n_levels from train_output;
{US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
</pre>
-- Predict for previously not seen data by assuming NULL value as the default
+-# Predict for previously not seen data by assuming NULL value as the default
<pre class='example'>
-drop table if exists table_test;
-create table table_test (
-id integer,
-country text,
-city text,
-weather text,
-expected_response text
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+ id integer,
+ country text,
+ city text,
+ weather text,
+ expected_response text
);
-insert into table_test values
+INSERT INTO table_test VALUES
(1,'IN','MUM','cloudy','a'),
(2,'US','HOU','humid','b'),
(3,'US','NY','sunny','c'),
@@ -979,7 +979,7 @@ SELECT madlib.tree_predict('train_output',
'response');
SELECT s.id, expected_response, estimated_response
FROM prediction_results p, table_test s
-where s.id = p.id ORDER BY id;
+WHERE s.id = p.id ORDER BY id;
</pre>
<pre class='result'>
id | expected_response | estimated_response
@@ -1229,7 +1229,8 @@ CREATE AGGREGATE MADLIB_SCHEMA._dst_compute_entropy(
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA._map_catlevel_to_int(
cat_values_in_text TEXT[], -- categorical variable value from each row
cat_levels_in_text TEXT[], -- all levels in text
- cat_n_levels INTEGER[] -- number of levels for each categorical variable
+ cat_n_levels INTEGER[], -- number of levels for each categorical variable
+ null_as_category BOOLEAN -- flag to check if NULL is treated as a separate category
) RETURNS INTEGER[] AS
'MODULE_PATHNAME', 'map_catlevel_to_int'
LANGUAGE c IMMUTABLE
http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/4264fa9d/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 93b9a52..e585379 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -43,7 +43,7 @@ SELECT * FROM dummy_splits;
-- cat encoding
SELECT
assert(
- relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2]), ARRAY[1]) < 1e-6,
+ relative_error(_map_catlevel_to_int('{B}', '{A,B}', ARRAY[2], TRUE), ARRAY[1]) < 1e-6,
'wrong results in _map_catlevel_to_int()')
;