You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hivemall.apache.org by my...@apache.org on 2019/06/19 10:12:17 UTC
[incubator-hivemall] branch master updated: Fixed the usage of
min-max scaling and zscore
This is an automated email from the ASF dual-hosted git repository.
myui pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-hivemall.git
The following commit(s) were added to refs/heads/master by this push:
new 73a5227 Fixed the usage of min-max scaling and zscore
73a5227 is described below
commit 73a5227e7054ee0530853e8d13c0c27b9ed5f82d
Author: Makoto Yui <my...@apache.org>
AuthorDate: Wed Jun 19 19:12:03 2019 +0900
Fixed the usage of min-max scaling and zscore
---
docs/gitbook/ft_engineering/scaling.md | 95 +++++++++++++---------------------
1 file changed, 35 insertions(+), 60 deletions(-)
diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md
index 00288e8..1ba0f96 100644
--- a/docs/gitbook/ft_engineering/scaling.md
+++ b/docs/gitbook/ft_engineering/scaling.md
@@ -36,11 +36,22 @@ select l2_normalize(array('apple:1.0', 'banana:0.5'))
> ["apple:0.8944272","banana:0.4472136"]
# Min-Max Normalization
-https://en.wikipedia.org/wiki/Feature_scaling#Rescaling
+
+[Min-max normalization](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) converts values to range `[0.0,1.0]`.
+
+```sql
+select
+ rescale(target, min(target) over (), max(target) over ()) as target
+from
+ e2006tfidf_train
+```
+
+It can also expressed without Windowing function as follows:
+
```sql
select min(target), max(target)
from (
-select target from e2006tfidf_train
+ select target from e2006tfidf_train
-- union all
-- select target from e2006tfidf_test
) t;
@@ -63,28 +74,12 @@ from
```
# Feature scaling by zscore
-https://en.wikipedia.org/wiki/Standard_score
-```sql
-select avg(target), stddev_pop(target)
-from (
-select target from e2006tfidf_train
--- union all
--- select target from e2006tfidf_test
-) t;
-```
-> -3.566241460963296 0.6278076335455348
+Refer [this article](https://en.wikipedia.org/wiki/Standard_score) to get details about Zscore.
```sql
-set hivevar:mean_target=-3.566241460963296;
-set hivevar:stddev_target=0.6278076335455348;
-
-create or replace view e2006tfidf_train_scaled
-as
select
- rowid,
- zscore(target, ${mean_target}, ${stddev_target}) as target,
- features
+ zscore(target, avg(target) over (), stddev_pop(target) over ()) as target
from
e2006tfidf_train;
```
@@ -108,49 +103,29 @@ We can create a normalized table as follows:
```sql
create table train_normalized
as
-WITH fv as (
-select
- rowid,
- extract_feature(feature) as feature,
- extract_weight(feature) as value
-from
- train
- LATERAL VIEW explode(features) exploded AS feature
-),
-stats as (
-select
- feature,
- -- avg(value) as mean, stddev_pop(value) as stddev
- min(value) as min, max(value) as max
-from
- fv
-group by
- feature
+WITH exploded as (
+ select
+ rowid,
+ extract_feature(feature) as feature,
+ extract_weight(feature) as value
+ from
+ train
+ LATERAL VIEW explode(features) exploded AS feature
),
-norm as (
-select
- rowid,
- t1.feature,
- -- zscore(t1.value, t2.mean, t2.stddev) as zscore
- rescale(t1.value, t2.min, t2.max) as minmax
-from
- fv t1 JOIN
- stats t2 ON (t1.feature = t2.feature)
-),
-norm_fv as (
-select
- rowid,
- -- concat(feature, ":", zscore) as feature
- -- concat(feature, ":", minmax) as feature -- Before Hivemall v0.3.2-1
- feature(feature, minmax) as feature -- Hivemall v0.3.2-1 or later
-from
- norm
+scaled as (
+ select
+ rowid,
+ feature,
+ rescale(value, min(value) over (), max(value) over ()) as minmax,
+ zscore(value, avg(value) over (), stddev_pop(value) over ()) as zscore
+ from
+ exploded
)
-select
- rowid,
- collect_list(feature) as features
+select
+ rowid,
+ collect_list(feature(feature, minmax)) as features
from
- norm_fv
+ scaled
group by
rowid
;