You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@doris.apache.org by 姜勇辉 <ji...@fun.tv> on 2022/06/21 02:58:40 UTC

bucket shuffle join导致动态分区表的单分区join时,每次join结果不一样

Bucket Shuffle Join
问题:
当我在使用动态分区表left_table的单个分区join维表right_table时,发现每次join结果不一样。我不知道为什么会产生这种原因。
我进行了一些分析,发现当左表left_table不是分区表时,join结果正常。下面为我的测试用例。
 
测试Bucket Shuffle Join
1.创建left_table和right_table
SQL
#创建左表
create table left_table(
  `data_date` date, 
  `id` int, 
  `name` varchar(256)
)
duplicate key(data_date,id)
PARTITION BY RANGE(data_date)()
distributed by hash(`id`) buckets 10
PROPERTIES
(
    "dynamic_partition.enable" = "true",
    "dynamic_partition.time_unit" = "DAY",
    "dynamic_partition.end" = "3",
    "dynamic_partition.prefix" = "p",
    "dynamic_partition.buckets" = "32",
    "dynamic_partition.create_history_partition" = "true",
    "dynamic_partition.history_partition_num" = "30"
);

#创建右表
CREATE TABLE `right_table` (
  `id` int(11) NULL COMMENT "",
  `city` varchar(500) NULL COMMENT "",
  `name` varchar(256) NULL COMMENT "",
  `sex` varchar(100) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`id`, `city`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`id`) BUCKETS 10
PROPERTIES (
"replication_allocation" = "tag.location.default: 3",
"in_memory" = "false",
"storage_format" = "V2"
);
插入数据
left_table
SQL
insert into left_table values('2022-06-20',1,'a1'),('2022-06-20',1,'a1'),('2022-06-20',1,'a1'),('2022-06-20',2,'a2'),('2022-06-20',2,'a2'),('2022-06-20',2,null),('2022-06-20',null,'a2'),('2022-06-20',null,null),('2022-06-20',3,'a3'),('2022-06-20',3,'a3'),('2022-06-20',4,'a4'),('2022-06-20',4,'a4'),('2022-06-20',5,'a5'),('2022-06-20',6,'a6'),('2022-06-20',7,'a7'),('2022-06-20',8,'a8'),('2022-06-20',9,'a9')
 
right_table
SQL
insert into right_table values(1,'bj','a1','man'),(2,'sh','a2','man'),(3,'wh','a3','man'),(4,'gz','a4','man'),(5,'bj','a5','man'),(6,'tj','a6','man'),(7,'cd','a7','man'),(8,'xm','a8','woman'),(9,'nc','a9','woman')
 
查询结果,每次随机生成数据,join不准确
SQL
select data_date,city,count(*) vv, count(distinct sex) uv from left_table a 
left join right_table b on a.id = b.id
where data_date='2022-06-20'
group by data_date,city;
第一次运行
 
第二次运行
 
第三次运行