You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "ocean (Jira)" <ji...@apache.org> on 2021/12/09 06:07:00 UTC

[jira] [Comment Edited] (SPARK-37581) sql hang at planning stage

    [ https://issues.apache.org/jira/browse/SPARK-37581?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17456154#comment-17456154 ] 

ocean edited comment on SPARK-37581 at 12/9/21, 6:06 AM:
---------------------------------------------------------

Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish.  

 

I think, we can debug it at 11 join operators, to find why it is so slow.

=======

drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
group by dt) t11
on calendar.dt = t11.dt







 


was (Author: oceaneast):
Hi [~hyukjin.kwon]. This sql have 19 join operators.But these join have the same pattern. I found that ,when have 10 join operators, it costs 17s. when 11 join operators, costs 39s. when 12 join operators, costs 120s. when 13 join operators , it can not finish.  

 

I think, we can debug it at 11 join operators, to find why it is so slow.

=======

drop table if exists test.test_c;create table if not exists test.test_c stored as ORCFILE as
select calendar.day,calendar.week,calendar.weekday, a_kbs,
b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,j_kbs,k_kbs
from (select * from test.test_a where dt = '20211126') calendar
left join
(select dt,count(distinct kb_code) as a_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t1
on calendar.dt = t1.dt

left join
(select dt,count(distinct kb_code) as b_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t2
on calendar.dt = t2.dt


left join
(select dt,count(distinct kb_code) as c_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t3
on calendar.dt = t3.dt

left join
(select dt,count(distinct kb_code) as d_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t4
on calendar.dt = t4.dt

left join
(select dt,count(distinct kb_code) as e_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t5
on calendar.dt = t5.dt

left join
(select dt,count(distinct kb_code) as f_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t6
on calendar.dt = t6.dt

left join
(select dt,count(distinct kb_code) as g_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t7
on calendar.dt = t7.dt

left join
(select dt,count(distinct kb_code) as h_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t8
on calendar.dt = t8.dt

left join
(select dt,count(distinct kb_code) as i_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t9
on calendar.dt = t9.dt

left join
(select dt,count(distinct kb_code) as j_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t10
on calendar.dt = t10.dt

left join
(select dt,count(distinct kb_code) as k_kbs
from test.test_b
where dt = '20211126'
and app_id in ('1','2')
and substr(kb_code,1,6) = '666666'
and pageid_merge = 'aaaaa'
group by dt) t11
on calendar.dt = t11.dt







 

> sql hang at planning stage
> --------------------------
>
>                 Key: SPARK-37581
>                 URL: https://issues.apache.org/jira/browse/SPARK-37581
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.1.1, 3.2.0
>            Reporter: ocean
>            Priority: Major
>
> when exec a sql, this sql hang at planning stage.
> when disable DPP, sql can finish normally.
> we can reproduce this  problem through example below:
> create table test.test_a (
> day string,
> week int,
> weekday int)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_a partition (dt=20211126) values('1',1,2);
> create table test.test_b (
> session_id string,
> device_id string,
> brand string,
> model string,
> wx_version string,
> os string,
> net_work_type string,
> app_id string,
> app_name string,
> col_z string,
> page_url string,
> page_title string,
> olabel string,
> otitle string,
> source string,
> send_dt string,
> recv_dt string,
> request_time string,
> write_time string,
> client_ip string,
> col_a string,
> dt_hour varchar(12),
> product string,
> channelfrom string,
> customer_um string,
> kb_code string,
> col_b string,
> rectype string,
> errcode string,
> col_c string,
> pageid_merge string)
> partitioned by (
> dt varchar(8))
> stored as orc;
> insert into test.test_b partition(dt=20211126)
> values('2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2','2');
> create table if not exists test.test_c stored as ORCFILE as
> select calendar.day,calendar.week,calendar.weekday, a_kbs,
> b_kbs, c_kbs,d_kbs,e_kbs,f_kbs,g_kbs,h_kbs,i_kbs,
> j_kbs,k_kbs,l_kbs,m_kbs,n_kbs,o_kbs,p_kbs,q_kbs,r_kbs,s_kbs
> from (select * from test.test_a where dt = '20211126') calendar
> left join
> (select dt,count(distinct kb_code) as a_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t1
> on calendar.dt = t1.dt
> left join
> (select dt,count(distinct kb_code) as b_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t2
> on calendar.dt = t2.dt
> left join
> (select dt,count(distinct kb_code) as c_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t3
> on calendar.dt = t3.dt
> left join
> (select dt,count(distinct kb_code) as d_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t4
> on calendar.dt = t4.dt
> left join
> (select dt,count(distinct kb_code) as e_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t5
> on calendar.dt = t5.dt
> left join
> (select dt,count(distinct kb_code) as f_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t6
> on calendar.dt = t6.dt
> left join
> (select dt,count(distinct kb_code) as g_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t7
> on calendar.dt = t7.dt
> left join
> (select dt,count(distinct kb_code) as h_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t8
> on calendar.dt = t8.dt
> left join
> (select dt,count(distinct kb_code) as i_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t9
> on calendar.dt = t9.dt
> left join
> (select dt,count(distinct kb_code) as j_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t10
> on calendar.dt = t10.dt
> left join
> (select dt,count(distinct kb_code) as k_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t11
> on calendar.dt = t11.dt
> left join
> (select dt,count(distinct kb_code) as l_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t12
> on calendar.dt = t12.dt
> left join
> (select dt,count(distinct kb_code) as m_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t13
> on calendar.dt = t13.dt
> left join
> (select dt,count(distinct kb_code) as n_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t14
> on calendar.dt = t14.dt
> left join
> (select dt,count(distinct kb_code) as o_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t15
> on calendar.dt = t15.dt
> left join
> (select dt,count(distinct kb_code) as p_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t16
> on calendar.dt = t16.dt
> left join
> (select dt,count(distinct kb_code) as q_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t17
> on calendar.dt = t17.dt
> left join
> (select dt,count(distinct kb_code) as r_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t18
> on calendar.dt = t18.dt
> left join
> (select dt,count(distinct kb_code) as s_kbs
> from test.test_b
> where dt = '20211126'
> and app_id in ('1','2')
> and substr(kb_code,1,6) = '666666'
> and pageid_merge = 'aaaaa'
> group by dt) t19
> on calendar.dt = t19.dt;



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org