You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yuming Wang (Jira)" <ji...@apache.org> on 2022/07/26 09:37:00 UTC

[jira] [Comment Edited] (SPARK-39750) Enable spark.sql.cbo.enabled by default

    [ https://issues.apache.org/jira/browse/SPARK-39750?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17571308#comment-17571308 ] 

Yuming Wang edited comment on SPARK-39750 at 7/26/22 9:36 AM:
--------------------------------------------------------------

A special case, this is part of tpc-ds q47:
{code:sql}
 SELECT
  i_category,
  i_brand,
  sum(ss_sales_price) sum_sales
FROM item, store_sales
WHERE ss_item_sk = i_item_sk
GROUP BY 1, 2;
{code}
set spark.sql.cbo.enabled=false;
{noformat}
Aggregate [i_category#18, i_brand#14], [i_category#18, i_brand#14, MakeDecimal(sum(UnscaledValue(ss_sales_price#40)),17,2) AS sum_sales#116], Statistics(sizeInBytes=412.1 PiB)
+- Project [i_brand#14, i_category#18, ss_sales_price#40], Statistics(sizeInBytes=412.1 PiB)
   +- Join Inner, (ss_item_sk#29 = i_item_sk#6), Statistics(sizeInBytes=471.0 PiB)
      :- Project [i_item_sk#6, i_brand#14, i_category#18], Statistics(sizeInBytes=5.9 MiB)
      :  +- Filter isnotnull(i_item_sk#6), Statistics(sizeInBytes=33.8 MiB)
      :     +- Relation hermes_tpcds5t.item[i_item_sk#6,i_item_id#7,i_rec_start_date#8,i_rec_end_date#9,i_item_desc#10,i_current_price#11,i_wholesale_cost#12,i_brand_id#13,i_brand#14,i_class_id#15,i_class#16,i_category_id#17,i_category#18,i_manufact_id#19,i_manufact#20,i_size#21,i_formulation#22,i_color#23,i_units#24,i_container#25,i_manager_id#26,i_product_name#27] parquet, Statistics(sizeInBytes=33.8 MiB)
      +- Project [ss_item_sk#29, ss_sales_price#40], Statistics(sizeInBytes=79.3 GiB)
         +- Filter isnotnull(ss_item_sk#29), Statistics(sizeInBytes=602.8 GiB)
            +- Relation hermes_tpcds5t.store_sales[ss_sold_time_sk#28,ss_item_sk#29,ss_customer_sk#30,ss_cdemo_sk#31,ss_hdemo_sk#32,ss_addr_sk#33,ss_store_sk#34,ss_promo_sk#35,ss_ticket_number#36L,ss_quantity#37,ss_wholesale_cost#38,ss_list_price#39,ss_sales_price#40,ss_ext_discount_amt#41,ss_ext_sales_price#42,ss_ext_wholesale_cost#43,ss_ext_list_price#44,ss_ext_tax#45,ss_coupon_amt#46,ss_net_paid#47,ss_net_paid_inc_tax#48,ss_net_profit#49,ss_sold_date_sk#50] parquet, Statistics(sizeInBytes=602.8 GiB)
{noformat}
set spark.sql.cbo.enabled=true;
{noformat}
Aggregate [i_category#18, i_brand#14], [i_category#18, i_brand#14, MakeDecimal(sum(UnscaledValue(ss_sales_price#40)),17,2) AS sum_sales#92], Statistics(sizeInBytes=454.8 KiB, rowCount=7.39E+3)
+- Project [i_brand#14, i_category#18, ss_sales_price#40], Statistics(sizeInBytes=811.6 GiB, rowCount=1.38E+10)
   +- Join Inner, (ss_item_sk#29 = i_item_sk#6), Statistics(sizeInBytes=914.7 GiB, rowCount=1.38E+10)
      :- Project [i_item_sk#6, i_brand#14, i_category#18], Statistics(sizeInBytes=20.9 MiB, rowCount=3.72E+5)
      :  +- Filter isnotnull(i_item_sk#6), Statistics(sizeInBytes=151.1 MiB, rowCount=3.72E+5)
      :     +- Relation hermes_tpcds5t.item[i_item_sk#6,i_item_id#7,i_rec_start_date#8,i_rec_end_date#9,i_item_desc#10,i_current_price#11,i_wholesale_cost#12,i_brand_id#13,i_brand#14,i_class_id#15,i_class#16,i_category_id#17,i_category#18,i_manufact_id#19,i_manufact#20,i_size#21,i_formulation#22,i_color#23,i_units#24,i_container#25,i_manager_id#26,i_product_name#27] parquet, Statistics(sizeInBytes=151.1 MiB, rowCount=3.72E+5)
      +- Project [ss_item_sk#29, ss_sales_price#40], Statistics(sizeInBytes=268.2 GiB, rowCount=1.44E+10)
         +- Filter isnotnull(ss_item_sk#29), Statistics(sizeInBytes=2038.5 GiB, rowCount=1.44E+10)
            +- Relation hermes_tpcds5t.store_sales[ss_sold_time_sk#28,ss_item_sk#29,ss_customer_sk#30,ss_cdemo_sk#31,ss_hdemo_sk#32,ss_addr_sk#33,ss_store_sk#34,ss_promo_sk#35,ss_ticket_number#36L,ss_quantity#37,ss_wholesale_cost#38,ss_list_price#39,ss_sales_price#40,ss_ext_discount_amt#41,ss_ext_sales_price#42,ss_ext_wholesale_cost#43,ss_ext_list_price#44,ss_ext_tax#45,ss_coupon_amt#46,ss_net_paid#47,ss_net_paid_inc_tax#48,ss_net_profit#49,ss_sold_date_sk#50] parquet, Statistics(sizeInBytes=2038.5 GiB, rowCount=1.44E+10)
{noformat}
We will plan to sort merge join because the left side size is {{{}20.9 MiB{}}}.

 

Another case:
{code:sql}
select cd_demo_sk, cd_gender, cd_marital_status, cd_dep_count, cd_dep_employed_count, cd_dep_college_count from customer_demographics;
{code}
set spark.sql.cbo.enabled=false;
{noformat}
Project [cd_demo_sk#167, cd_gender#168, cd_marital_status#169, cd_dep_count#173, cd_dep_employed_count#174, cd_dep_college_count#175], Statistics(sizeInBytes=4.4 MiB)
+- Relation hermes_tpcds5t.customer_demographics[cd_demo_sk#167,cd_gender#168,cd_marital_status#169,cd_education_status#170,cd_purchase_estimate#171,cd_credit_rating#172,cd_dep_count#173,cd_dep_employed_count#174,cd_dep_college_count#175] parquet, Statistics(sizeInBytes=7.4 MiB)
{noformat}

set spark.sql.cbo.enabled=true;
{noformat}
Project [cd_demo_sk#167, cd_gender#168, cd_marital_status#169, cd_dep_count#173, cd_dep_employed_count#174, cd_dep_college_count#175], Statistics(sizeInBytes=91.6 MiB, rowCount=1.92E+6)
+- Relation hermes_tpcds5t.customer_demographics[cd_demo_sk#167,cd_gender#168,cd_marital_status#169,cd_education_status#170,cd_purchase_estimate#171,cd_credit_rating#172,cd_dep_count#173,cd_dep_employed_count#174,cd_dep_college_count#175] parquet, Statistics(sizeInBytes=174.0 MiB, rowCount=1.92E+6)
{noformat}



was (Author: q79969786):
A special case, this is part of tpc-ds q47:
{code:sql}
 SELECT
  i_category,
  i_brand,
  sum(ss_sales_price) sum_sales
FROM item, store_sales
WHERE ss_item_sk = i_item_sk
GROUP BY 1, 2;
{code}

set spark.sql.cbo.enabled=false;
{noformat}
Aggregate [i_category#18, i_brand#14], [i_category#18, i_brand#14, MakeDecimal(sum(UnscaledValue(ss_sales_price#40)),17,2) AS sum_sales#116], Statistics(sizeInBytes=412.1 PiB)
+- Project [i_brand#14, i_category#18, ss_sales_price#40], Statistics(sizeInBytes=412.1 PiB)
   +- Join Inner, (ss_item_sk#29 = i_item_sk#6), Statistics(sizeInBytes=471.0 PiB)
      :- Project [i_item_sk#6, i_brand#14, i_category#18], Statistics(sizeInBytes=5.9 MiB)
      :  +- Filter isnotnull(i_item_sk#6), Statistics(sizeInBytes=33.8 MiB)
      :     +- Relation hermes_tpcds5t.item[i_item_sk#6,i_item_id#7,i_rec_start_date#8,i_rec_end_date#9,i_item_desc#10,i_current_price#11,i_wholesale_cost#12,i_brand_id#13,i_brand#14,i_class_id#15,i_class#16,i_category_id#17,i_category#18,i_manufact_id#19,i_manufact#20,i_size#21,i_formulation#22,i_color#23,i_units#24,i_container#25,i_manager_id#26,i_product_name#27] parquet, Statistics(sizeInBytes=33.8 MiB)
      +- Project [ss_item_sk#29, ss_sales_price#40], Statistics(sizeInBytes=79.3 GiB)
         +- Filter isnotnull(ss_item_sk#29), Statistics(sizeInBytes=602.8 GiB)
            +- Relation hermes_tpcds5t.store_sales[ss_sold_time_sk#28,ss_item_sk#29,ss_customer_sk#30,ss_cdemo_sk#31,ss_hdemo_sk#32,ss_addr_sk#33,ss_store_sk#34,ss_promo_sk#35,ss_ticket_number#36L,ss_quantity#37,ss_wholesale_cost#38,ss_list_price#39,ss_sales_price#40,ss_ext_discount_amt#41,ss_ext_sales_price#42,ss_ext_wholesale_cost#43,ss_ext_list_price#44,ss_ext_tax#45,ss_coupon_amt#46,ss_net_paid#47,ss_net_paid_inc_tax#48,ss_net_profit#49,ss_sold_date_sk#50] parquet, Statistics(sizeInBytes=602.8 GiB)
{noformat}

set spark.sql.cbo.enabled=true;
{noformat}
Aggregate [i_category#18, i_brand#14], [i_category#18, i_brand#14, MakeDecimal(sum(UnscaledValue(ss_sales_price#40)),17,2) AS sum_sales#92], Statistics(sizeInBytes=454.8 KiB, rowCount=7.39E+3)
+- Project [i_brand#14, i_category#18, ss_sales_price#40], Statistics(sizeInBytes=811.6 GiB, rowCount=1.38E+10)
   +- Join Inner, (ss_item_sk#29 = i_item_sk#6), Statistics(sizeInBytes=914.7 GiB, rowCount=1.38E+10)
      :- Project [i_item_sk#6, i_brand#14, i_category#18], Statistics(sizeInBytes=20.9 MiB, rowCount=3.72E+5)
      :  +- Filter isnotnull(i_item_sk#6), Statistics(sizeInBytes=151.1 MiB, rowCount=3.72E+5)
      :     +- Relation hermes_tpcds5t.item[i_item_sk#6,i_item_id#7,i_rec_start_date#8,i_rec_end_date#9,i_item_desc#10,i_current_price#11,i_wholesale_cost#12,i_brand_id#13,i_brand#14,i_class_id#15,i_class#16,i_category_id#17,i_category#18,i_manufact_id#19,i_manufact#20,i_size#21,i_formulation#22,i_color#23,i_units#24,i_container#25,i_manager_id#26,i_product_name#27] parquet, Statistics(sizeInBytes=151.1 MiB, rowCount=3.72E+5)
      +- Project [ss_item_sk#29, ss_sales_price#40], Statistics(sizeInBytes=268.2 GiB, rowCount=1.44E+10)
         +- Filter isnotnull(ss_item_sk#29), Statistics(sizeInBytes=2038.5 GiB, rowCount=1.44E+10)
            +- Relation hermes_tpcds5t.store_sales[ss_sold_time_sk#28,ss_item_sk#29,ss_customer_sk#30,ss_cdemo_sk#31,ss_hdemo_sk#32,ss_addr_sk#33,ss_store_sk#34,ss_promo_sk#35,ss_ticket_number#36L,ss_quantity#37,ss_wholesale_cost#38,ss_list_price#39,ss_sales_price#40,ss_ext_discount_amt#41,ss_ext_sales_price#42,ss_ext_wholesale_cost#43,ss_ext_list_price#44,ss_ext_tax#45,ss_coupon_amt#46,ss_net_paid#47,ss_net_paid_inc_tax#48,ss_net_profit#49,ss_sold_date_sk#50] parquet, Statistics(sizeInBytes=2038.5 GiB, rowCount=1.44E+10)
{noformat}

We will plan to sort merge join because the left side size is {{20.9 MiB}}.


> Enable spark.sql.cbo.enabled by default
> ---------------------------------------
>
>                 Key: SPARK-39750
>                 URL: https://issues.apache.org/jira/browse/SPARK-39750
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.4.0
>            Reporter: Yuming Wang
>            Priority: Major
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org