You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Liu Shuo (Jira)" <ji...@apache.org> on 2023/04/19 08:49:00 UTC

[jira] [Created] (SPARK-43182) 3 tables join with limit when AE is enabled and one table is skewed

Liu Shuo created SPARK-43182:
--------------------------------

             Summary: 3 tables join with limit when AE is enabled and one table is skewed
                 Key: SPARK-43182
                 URL: https://issues.apache.org/jira/browse/SPARK-43182
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 3.4.0
            Reporter: Liu Shuo


When we test AE in Spark3.4.0 with the following case, we find If we disable AE or enable Ae but disable skewJoin, the sql will finish in 20s, but if we enable AE and enable skewJoin,it will take very long time.

The test case:
{code:java}
create table source_aqe(c1 int,c18 string) using csv options(path 'file:///tmp/spark-warehouse/data/');
create table hive_snappy_aqe_table1(c1 int)stored as PARQUET partitioned by(c18 string); 
insert into table hive_snappy_aqe_table1 partition(c18=1)select c1 from source_aqe;
insert into table hive_snappy_aqe_table1 partition(c18=2)select c1 from source_aqe limit 120000;
insert into table hive_snappy_aqe_table1 partition(c18=3)select c1 from source_aqe limit 150000;create table hive_snappy_aqe_table2(c1 int)stored as PARQUET partitioned by(c18 string); 
insert into table hive_snappy_aqe_table2 partition(c18=1)select c1 from source_aqe limit 160000;
insert into table hive_snappy_aqe_table2 partition(c18=2)select c1 from source_aqe limit 120000;create table hive_snappy_aqe_table3(c1 int)stored as PARQUET partitioned by(c18 string); 
insert into table hive_snappy_aqe_table3 partition(c18=1)select c1 from source_aqe limit 160000;
insert into table hive_snappy_aqe_table3 partition(c18=2)select c1 from source_aqe limit 120000;
set spark.sql.adaptive.enabled=false;
set spark.sql.adaptive.forceOptimizeSkewedJoin = false;
set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1;
set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB;
set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB;
set spark.sql.autoBroadcastJoinThreshold = 51200;
 
###it will finish in 20s 
select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10;
set spark.sql.adaptive.enabled=true;
set spark.sql.adaptive.forceOptimizeSkewedJoin = true;
set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1;
set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB;
set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB;
set spark.sql.autoBroadcastJoinThreshold = 51200;
###it will take very long time 
select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10;{code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org