You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "JESSE CHEN (JIRA)" <ji...@apache.org> on 2016/11/16 00:04:58 UTC
[jira] [Created] (SPARK-18458) core dumped running Spark SQL on
large data volume (100TB)
JESSE CHEN created SPARK-18458:
----------------------------------
Summary: core dumped running Spark SQL on large data volume (100TB)
Key: SPARK-18458
URL: https://issues.apache.org/jira/browse/SPARK-18458
Project: Spark
Issue Type: Bug
Components: SQL
Affects Versions: 1.6.0
Reporter: JESSE CHEN
Fix For: 2.0.0
Testing Spark SQL using TPC queries. Query 49 returns wrong results compared to official result set. This is at 1GB SF (validation run).
SparkSQL has right answer but in wrong order (and there is an 'order by' in the query).
Actual results:
{noformat}
store,9797,0.80000000000000000000,2,2]
[store,12641,0.81609195402298850575,3,3]
[store,6661,0.92207792207792207792,7,7]
[store,13013,0.94202898550724637681,8,8]
[store,9029,1.00000000000000000000,10,10]
[web,15597,0.66197183098591549296,3,3]
[store,14925,0.96470588235294117647,9,9]
[store,4063,1.00000000000000000000,10,10]
[catalog,8929,0.76250000000000000000,7,7]
[store,11589,0.82653061224489795918,6,6]
[store,1171,0.82417582417582417582,5,5]
[store,9471,0.77500000000000000000,1,1]
[catalog,12577,0.65591397849462365591,3,3]
[web,97,0.90361445783132530120,9,8]
[web,85,0.85714285714285714286,8,7]
[catalog,361,0.74647887323943661972,5,5]
[web,2915,0.69863013698630136986,4,4]
[web,117,0.92500000000000000000,10,9]
[catalog,9295,0.77894736842105263158,9,9]
[web,3305,0.73750000000000000000,6,16]
[catalog,16215,0.79069767441860465116,10,10]
[web,7539,0.59000000000000000000,1,1]
[catalog,17543,0.57142857142857142857,1,1]
[catalog,3411,0.71641791044776119403,4,4]
[web,11933,0.71717171717171717172,5,5]
[catalog,14513,0.63541666666666666667,2,2]
[store,15839,0.81632653061224489796,4,4]
[web,3337,0.62650602409638554217,2,2]
[web,5299,0.92708333333333333333,11,10]
[catalog,8189,0.74698795180722891566,6,6]
[catalog,14869,0.77173913043478260870,8,8]
[web,483,0.80000000000000000000,7,6]
{noformat}
Expected results:
{noformat}
+---------+-------+--------------------+-------------+---------------+
| CHANNEL | ITEM | RETURN_RATIO | RETURN_RANK | CURRENCY_RANK |
+---------+-------+--------------------+-------------+---------------+
| catalog | 17543 | .5714285714285714 | 1 | 1 |
| catalog | 14513 | .6354166666666666 | 2 | 2 |
| catalog | 12577 | .6559139784946236 | 3 | 3 |
| catalog | 3411 | .7164179104477611 | 4 | 4 |
| catalog | 361 | .7464788732394366 | 5 | 5 |
| catalog | 8189 | .7469879518072289 | 6 | 6 |
| catalog | 8929 | .7625000000000000 | 7 | 7 |
| catalog | 14869 | .7717391304347826 | 8 | 8 |
| catalog | 9295 | .7789473684210526 | 9 | 9 |
| catalog | 16215 | .7906976744186046 | 10 | 10 |
| store | 9471 | .7750000000000000 | 1 | 1 |
| store | 9797 | .8000000000000000 | 2 | 2 |
| store | 12641 | .8160919540229885 | 3 | 3 |
| store | 15839 | .8163265306122448 | 4 | 4 |
| store | 1171 | .8241758241758241 | 5 | 5 |
| store | 11589 | .8265306122448979 | 6 | 6 |
| store | 6661 | .9220779220779220 | 7 | 7 |
| store | 13013 | .9420289855072463 | 8 | 8 |
| store | 14925 | .9647058823529411 | 9 | 9 |
| store | 4063 | 1.0000000000000000 | 10 | 10 |
| store | 9029 | 1.0000000000000000 | 10 | 10 |
| web | 7539 | .5900000000000000 | 1 | 1 |
| web | 3337 | .6265060240963855 | 2 | 2 |
| web | 15597 | .6619718309859154 | 3 | 3 |
| web | 2915 | .6986301369863013 | 4 | 4 |
| web | 11933 | .7171717171717171 | 5 | 5 |
| web | 3305 | .7375000000000000 | 6 | 16 |
| web | 483 | .8000000000000000 | 7 | 6 |
| web | 85 | .8571428571428571 | 8 | 7 |
| web | 97 | .9036144578313253 | 9 | 8 |
| web | 117 | .9250000000000000 | 10 | 9 |
| web | 5299 | .9270833333333333 | 11 | 10 |
+---------+-------+--------------------+-------------+---------------+
{noformat}
Query used:
{noformat}
-- start query 49 in stream 0 using template query49.tpl and seed QUALIFICATION
select
'web' as channel
,web.item
,web.return_ratio
,web.return_rank
,web.currency_rank
from (
select
item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
( select ws.ws_item_sk as item
,(cast(sum(coalesce(wr.wr_return_quantity,0)) as decimal(15,4))/
cast(sum(coalesce(ws.ws_quantity,0)) as decimal(15,4) )) as return_ratio
,(cast(sum(coalesce(wr.wr_return_amt,0)) as decimal(15,4))/
cast(sum(coalesce(ws.ws_net_paid,0)) as decimal(15,4) )) as currency_ratio
from
web_sales ws left outer join web_returns wr
on (ws.ws_order_number = wr.wr_order_number and
ws.ws_item_sk = wr.wr_item_sk)
,date_dim
where
wr.wr_return_amt > 10000
and ws.ws_net_profit > 1
and ws.ws_net_paid > 0
and ws.ws_quantity > 0
and ws_sold_date_sk = d_date_sk
and d_year = 2001
and d_moy = 12
group by ws.ws_item_sk
) in_web
) web
where
(
web.return_rank <= 10
or
web.currency_rank <= 10
)
union
select
'catalog' as channel
,catalog.item
,catalog.return_ratio
,catalog.return_rank
,catalog.currency_rank
from (
select
item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
( select
cs.cs_item_sk as item
,(cast(sum(coalesce(cr.cr_return_quantity,0)) as decimal(15,4))/
cast(sum(coalesce(cs.cs_quantity,0)) as decimal(15,4) )) as return_ratio
,(cast(sum(coalesce(cr.cr_return_amount,0)) as decimal(15,4))/
cast(sum(coalesce(cs.cs_net_paid,0)) as decimal(15,4) )) as currency_ratio
from
catalog_sales cs left outer join catalog_returns cr
on (cs.cs_order_number = cr.cr_order_number and
cs.cs_item_sk = cr.cr_item_sk)
,date_dim
where
cr.cr_return_amount > 10000
and cs.cs_net_profit > 1
and cs.cs_net_paid > 0
and cs.cs_quantity > 0
and cs_sold_date_sk = d_date_sk
and d_year = 2001
and d_moy = 12
group by cs.cs_item_sk
) in_cat
) catalog
where
(
catalog.return_rank <= 10
or
catalog.currency_rank <=10
)
union
select
'store' as channel
,store.item
,store.return_ratio
,store.return_rank
,store.currency_rank
from (
select
item
,return_ratio
,currency_ratio
,rank() over (order by return_ratio) as return_rank
,rank() over (order by currency_ratio) as currency_rank
from
( select sts.ss_item_sk as item
,(cast(sum(coalesce(sr.sr_return_quantity,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_quantity,0)) as decimal(15,4) )) as return_ratio
,(cast(sum(coalesce(sr.sr_return_amt,0)) as decimal(15,4))/cast(sum(coalesce(sts.ss_net_paid,0)) as decimal(15,4) )) as currency_ratio
from
store_sales sts left outer join store_returns sr
on (sts.ss_ticket_number = sr.sr_ticket_number and sts.ss_item_sk = sr.sr_item_sk)
,date_dim
where
sr.sr_return_amt > 10000
and sts.ss_net_profit > 1
and sts.ss_net_paid > 0
and sts.ss_quantity > 0
and ss_sold_date_sk = d_date_sk
and d_year = 2001
and d_moy = 12
group by sts.ss_item_sk
) in_store
) store
where (
store.return_rank <= 10
or
store.currency_rank <= 10
)
order by 1,4,5
limit 100;
-- end query 49 in stream 0 using template query49.tpl
{noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org