You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hivemall.apache.org by "Takuya Kitazawa (JIRA)" <ji...@apache.org> on 2017/06/01 03:42:04 UTC
[jira] [Comment Edited] (HIVEMALL-19) Support DIMSUM for approx
all-pairs similarity computation
[ https://issues.apache.org/jira/browse/HIVEMALL-19?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16030877#comment-16030877 ]
Takuya Kitazawa edited comment on HIVEMALL-19 at 6/1/17 3:41 AM:
-----------------------------------------------------------------
Assume that we already have a table "transaction" like the following:
userid itemid purchase_at (timestamp)
1 31231 2015-04-09 00:29:02
1 13212 2016-05-24 16:29:02
2 312 2016-06-03 23:29:02
3 2313 2016-06-04 19:29:02
{code:title=DIMSUM|theme=FadeToGrey|linenumbers=true|language=sql|firstline=0001|collapse=true}
-- DIMSUM sketch: item-item similarity computed from the "transaction" table.
-- to_map, l2_norm, feature and dimsum_mapper are not defined here; they are
-- expected to be provided by Hivemall.
with agg as (
  -- number of purchase rows per (userid, itemid) pair
  select
    userid,
    itemid,
    count(1) as purchase_count
  from
    transaction
  group by
    userid, itemid
),
c_j as (
  -- per-item L2 norm of the log-scaled purchase counts, folded into a map
  -- keyed by itemid (presumably a single row, as to_map is used without
  -- GROUP BY -- confirm)
  select
    to_map(j, l2norm) as norms
  from (
    select
      itemid as j,
      l2_norm(ln(purchase_count+1)) as l2norm -- UDAF
    from
      agg
    group by
      itemid
  ) t0
),
t1 as (
  -- one row per user: the list of that user's item features
  select
    userid as i,
    collect_list(
      feature(itemid, ln(purchase_count+1))
    ) as r_i -- array(i1:0.1,i2:1.1); alias must match the r_i read in t2
  from
    agg
  group by
    userid
),
t2 as (
  select
    dimsum_mapper(r_i, c_j.norms) -- UDTF
    -- alternative idea:
    -- dimsum_mapper(r_i, map_get_values_as_list(c_j.norms, extract_features_as_list(r_i)))
      as (j, k, v_jk)
  from
    t1
    -- no ON clause: every t1 row is paired with the c_j row(s)
    -- NOTE(review): relies on Hive accepting a join without ON -- confirm
    left outer join c_j
)
-- sum the partial values emitted for each (j, k) pair
select
  j, k,
  sum(v_jk) as similarity
from
  t2
group by
  j, k
;
{code}
was (Author: takuti):
{code:title=DIMSUM|theme=FadeToGrey|linenumbers=true|language=sql|firstline=0001|collapse=true}
-- Previous revision of the DIMSUM sketch, reading from "user_purchased".
-- to_map, l2_norm, feature and dimsum_mapper are not defined here; they are
-- expected to be provided by Hivemall.
-- NOTE(review): this revision mixes the column names `user`/`item` with
-- `userid`; the schema of user_purchased is not shown, so the names are left
-- as written -- confirm which is correct. `user` may also need backtick
-- quoting where it is a reserved word.
with c_j as (
  -- L2 norm of the log-scaled purchase counts per `user`, folded into a map
  select
    to_map(j, l2norm) as l2norm
  from (
    select
      user as j,
      l2_norm(ln(purchase_count+1)) as l2norm -- UDAF (stray closing paren removed)
    from
      user_purchased
    group by
      user
  ) t0
),
t1 as (
  -- one row per item: the list of user features for that item
  select
    item as i,
    collect_list(
      feature(userid, ln(purchase_count+1))
    ) as r_i -- array(u1:0.1,u2:1.1); alias must match the r_i read in t2
  from
    user_purchased
  group by
    item
),
t2 as (
  select
    dimsum_mapper(r_i, c_j.l2norm) -- UDTF
    -- alternative idea:
    -- dimsum_mapper(r_i, map_get_values_as_list(c_j.l2norm, extract_features_as_list(r_i)))
      as (j, k, v_jk)
  from
    t1
    cross join c_j -- pair every t1 row with the norms map
)
-- sum the partial values emitted for each (j, k) pair
select
  j, k,
  sum(v_jk) as similarity
from
  t2
group by
  j, k
> Support DIMSUM for approx all-pairs similarity computation
> ----------------------------------------------------------
>
> Key: HIVEMALL-19
> URL: https://issues.apache.org/jira/browse/HIVEMALL-19
> Project: Hivemall
> Issue Type: Sub-task
> Reporter: Makoto Yui
> Assignee: Takuya Kitazawa
> Priority: Minor
>
> Support DIMSUM for approx all-pairs similarity computation.
> https://blog.twitter.com/2014/all-pairs-similarity-via-dimsum
> http://www.jmlr.org/papers/volume14/bosagh-zadeh13a/bosagh-zadeh13a.pdf
> https://github.com/alsoltani/DimSum
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)