You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/06/20 07:09:25 UTC
[incubator-hivemall] branch master updated: Fixed a bug in document
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 9f10450 Fixed a bug in document
9f10450 is described below
commit 9f10450b94c3d9249b025bbaf18a0f90a6f43546
Author: Makoto Yui <my...@apache.org>
AuthorDate: Thu Jun 20 16:09:16 2019 +0900
Fixed a bug in document
---
docs/gitbook/ft_engineering/scaling.md | 75 +++++++++-------------------------
1 file changed, 19 insertions(+), 56 deletions(-)
diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md
index 1ba0f96..e8e113e 100644
--- a/docs/gitbook/ft_engineering/scaling.md
+++ b/docs/gitbook/ft_engineering/scaling.md
@@ -89,7 +89,17 @@ from
Apply normalization to the following data.
```sql
-select rowid, features from train limit 3;
+create table train as
+select
+ 1 as rowid, array("weight:69.613","specific_heat:129.07","reflectance:52.111") as features
+UNION ALL
+select
+ 2 as rowid, array("weight:70.67","specific_heat:128.161","reflectance:52.446") as features
+UNION ALL
+select
+ 3 as rowid, array("weight:72.303","specific_heat:128.45","reflectance:52.853") as features;
+
+select rowid, features from train;
```
```
@@ -114,10 +124,10 @@ WITH exploded as (
),
scaled as (
select
- rowid,
- feature,
- rescale(value, min(value) over (), max(value) over ()) as minmax,
- zscore(value, avg(value) over (), stddev_pop(value) over ()) as zscore
+ rowid,
+ feature,
+ rescale(value, min(value) over (partition by feature), max(value) over (partition by feature)) as minmax,
+ zscore(value, avg(value) over (partition by feature), stddev_pop(value) over (partition by feature)) as zscore
from
exploded
)
@@ -127,59 +137,12 @@ select
from
scaled
group by
- rowid
-;
+ rowid;
```
```
-1 ["reflectance:0.5252967","specific_heat:0.19863537","weight:0.0"]
-2 ["reflectance:0.5950446","specific_heat:0.09166764","weight:0.052084323"]
-3 ["reflectance:0.6797837","specific_heat:0.12567581","weight:0.13255163"]
-...
-```
-
-# Tips for using both min-max and zscore normalization
-
-```sql
-WITH quantative as (
- select id, true as minmax, "age" as feature, age as value from train
- union all
- select id, false as minmax, "balance" as feature, balance as value from train
- union all
- select id, true as minmax, "day" as feature, day as value from train
- union all
- select id, false as minmax, "duration" as feature, duration as value from train
- union all
- select id, false as minmax, "campaign" as feature, campaign as value from train
- union all
- select id, false as minmax, "pdays" as feature, if(pdays = -1, 0, pdays) as value from train
- union all
- select id, false as minmax, "previous" as feature, previous as value from train
-),
-quantative_stats as (
-select
- feature,
- avg(value) as mean, stddev_pop(value) as stddev,
- min(value) as min, max(value) as max
-from
- quantative
-group by
- feature
-),
-quantative_norm as (
-select
- t1.id,
- collect_list(
- feature(
- t1.feature,
- if(t1.minmax,rescale(t1.value, t2.min, t2.max),zscore(t1.value, t2.mean, t2.stddev))
- )
- ) as features
-from
- quantative t1
- JOIN quantative_stats t2 ON (t1.feature = t2.feature)
-group by
- t1.id
-)
+1 ["reflectance:0.0","specific_heat:1.0","weight:0.0"]
+2 ["reflectance:0.4514809","specific_heat:0.0","weight:0.39293614"]
+3 ["reflectance:1.0","specific_heat:0.31792927","weight:1.0"]
...
```