You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "zengxl (Jira)" <ji...@apache.org> on 2021/10/14 06:35:00 UTC
[jira] [Updated] (HIVE-25614) Mapjoin then join left,the result is
incorrect
[ https://issues.apache.org/jira/browse/HIVE-25614?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
zengxl updated HIVE-25614:
--------------------------
Description:
Currently I join 3 tables, find the result of left join is *{color:#de350b}null{color}*
Here is my SQL,The result of this SQL is NULL
{code:java}
//代码占位符
CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl`(
`tbl_id` bigint COMMENT 'TBL_ID',
`tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
`db_id` bigint COMMENT 'DB_ID',
`tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
`owner` string COMMENT 'OWNER',
`retention` bigint COMMENT 'RETENTION',
`sd_id` bigint COMMENT 'SD_ID',
`tbl_name` string COMMENT 'TBL_NAME',
`tbl_type` string COMMENT 'TBL_TYPE',
`view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
`view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
`is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
`tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
`cd_id` bigint COMMENT 'CD_ID',
`input_format` string COMMENT 'INPUT_FORMAT',
`is_compressed` bigint COMMENT 'IS_COMPRESSED',
`is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
`tbl_or_part_location` string COMMENT 'tbl_or_part_location',
`num_buckets` bigint COMMENT 'NUM_BUCKETS',
`output_format` string COMMENT 'OUTPUT_FORMAT',
`serde_id` bigint COMMENT 'SERDE_ID',
`part_id` bigint COMMENT 'PART_ID',
`part_create_time` bigint COMMENT 'PART_CREATE_TIME',
`part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
`part_name` string COMMENT 'PART_NAME')
PARTITIONED BY (
`pt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name
from tmp1 a
left join tmp2 b on a.tbl_id=b.tbl_id
left join tmp3 d on a.sd_id=d.sd_id;
{code}
pods.pods_pf_hive_ah3_metastore_partitions_d、pods.pods_pf_hive_ah3_metastore_tbls_d、pods.pods_pf_hive_ah3_metastore_sds_d from {color:#de350b}*Metastore*{color} partitions、tbls、sds
The sizes of the three tables are as follows:
80.3 M 240.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_partitions_d/pt=2021-10-09/exchangis_hive_w__2585cbd4_8bf8_4fbb_8a90_f5a7939b62b3.snappy
179.8 K 539.5 K hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_tbls_d/pt=2021-10-09/exchangis_hive_w__8a62acaa_6f82_442e_97db_ce960833612f.snappy
94.3 M 282.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_sds_d/pt=2021-10-09/exchangis_hive_w__d25536e8_7018_4262_a00e_5af3b1f88925.snappy
The result is as follows,select from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl table *{color:#de350b}is null{color}*
{code:java}
hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
OK
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER NULL NULL NULL NULL NULL NULL NULL NULL 20302818 1628697610 0 pt=2021-08-11 2021-08-14
{code}
The reality pods.pods_pf_hive_ah3_metastore_sds_d table is that the data in this table is {color:#de350b}*not null*{color}
{code:java}
> select * from pods.pods_pf_hive_ah3_metastore_sds_d where pt='2021-08-12' and sd_id=21229815;
OK
Interrupting...
Be patient, this might take some time.
Press Ctrl+C again to kill JVM
21229815 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 2021-08-12
{code}
When I change the order of join,The following SQL:
{code:java}
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name
from tmp1 a
left join tmp3 d on a.sd_id=d.sd_id
left join tmp2 b on a.tbl_id=b.tbl_id;
{code}
The result of this SQL execution is as follows,this result is true ,select from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl table is *{color:#de350b}not null{color}*
{code:java}
hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
OK
Interrupting...
Be patient, this might take some time.
Press Ctrl+C again to kill JVM
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 2021-08-14
{code}
The query result is *{color:#de350b}null{color}* first mapjoin,then left join.Query results that are not *{color:#de350b}null{color}* are first left join ,then mapjoin
I tested it again first left join,after mapjoin ,last left join .The result of the left join is *{color:#de350b}null{color}*
Test SQL as follows:
{code:java}
CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl`(
`tbl_id` bigint COMMENT 'TBL_ID',
`tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
`db_id` bigint COMMENT 'DB_ID',
`tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
`owner` string COMMENT 'OWNER',
`retention` bigint COMMENT 'RETENTION',
`sd_id` bigint COMMENT 'SD_ID',
`tbl_name` string COMMENT 'TBL_NAME',
`tbl_type` string COMMENT 'TBL_TYPE',
`view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
`view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
`is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
`tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
`cd_id` bigint COMMENT 'CD_ID',
`input_format` string COMMENT 'INPUT_FORMAT',
`is_compressed` bigint COMMENT 'IS_COMPRESSED',
`is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
`tbl_or_part_location` string COMMENT 'tbl_or_part_location',
`num_buckets` bigint COMMENT 'NUM_BUCKETS',
`output_format` string COMMENT 'OUTPUT_FORMAT',
`serde_id` bigint COMMENT 'SERDE_ID',
`part_id` bigint COMMENT 'PART_ID',
`part_create_time` bigint COMMENT 'PART_CREATE_TIME',
`part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
`part_name` string COMMENT 'PART_NAME',
`cd_id_1` bigint COMMENT 'CD_ID_1',
`input_format_1` string COMMENT 'INPUT_FORMAT_1',
`is_compressed_1` bigint COMMENT 'IS_COMPRESSED_1',
`is_storedassubdirectories_1` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES_1',
`tbl_or_part_location_1` string COMMENT 'tbl_or_part_location_1',
`num_buckets_1` bigint COMMENT 'NUM_BUCKETS_1',
`output_format_1` string COMMENT 'OUTPUT_FORMAT_1',
`serde_id_1` bigint COMMENT 'SERDE_ID_1'
)
PARTITIONED BY (
`pt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name,
e.cd_id as cd_id_1,
e.input_format as input_format_1,
e.is_compressed as is_compressed_1,
e.is_storedassubdirectories as is_storedassubdirectories_1,
e.location as tbl_location_1,
e.num_buckets as num_buckets_1,
e.output_format as output_format_1,
e.serde_id as serde_id_1
from tmp1 a
left join tmp3 d on a.sd_id=d.sd_id
left join tmp2 b on a.tbl_id=b.tbl_id
left join tmp3 e on a.sd_id=e.sd_id
{code}
The result of this SQL execution is as follows:
{code:java}
> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl where pt='2021-08-14' and sd_id=21229815;
OK
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 NULL NULL NULL NULL NULL NULL NULL NULL 2021-08-14
{code}
fields cd_id, input_format, is_compressed, is_storedassubdirectories, tbl_or_part_location, num_buckets, output_format, serde_id result {color:#de350b}*is not null*{color} ,but fields cd_id_1, input_format_1, is_compressed_1, is_storedassubdirectories_1, tbl_or_part_location_1, num_buckets_1, output_format_1, serde_id_1 result{color:#de350b} is null{color}
was:
Currently I join 3 tables, find the result of left join is *{color:#de350b}null{color}*
Here is my SQL,The result of this SQL is NULL
{code:java}
//代码占位符
CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl`(
`tbl_id` bigint COMMENT 'TBL_ID',
`tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
`db_id` bigint COMMENT 'DB_ID',
`tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
`owner` string COMMENT 'OWNER',
`retention` bigint COMMENT 'RETENTION',
`sd_id` bigint COMMENT 'SD_ID',
`tbl_name` string COMMENT 'TBL_NAME',
`tbl_type` string COMMENT 'TBL_TYPE',
`view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
`view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
`is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
`tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
`cd_id` bigint COMMENT 'CD_ID',
`input_format` string COMMENT 'INPUT_FORMAT',
`is_compressed` bigint COMMENT 'IS_COMPRESSED',
`is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
`tbl_or_part_location` string COMMENT 'tbl_or_part_location',
`num_buckets` bigint COMMENT 'NUM_BUCKETS',
`output_format` string COMMENT 'OUTPUT_FORMAT',
`serde_id` bigint COMMENT 'SERDE_ID',
`part_id` bigint COMMENT 'PART_ID',
`part_create_time` bigint COMMENT 'PART_CREATE_TIME',
`part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
`part_name` string COMMENT 'PART_NAME')
PARTITIONED BY (
`pt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name
from tmp1 a
left join tmp2 b on a.tbl_id=b.tbl_id
left join tmp3 d on a.sd_id=d.sd_id;
{code}
pods.pods_pf_hive_ah3_metastore_partitions_d、pods.pods_pf_hive_ah3_metastore_tbls_d、pods.pods_pf_hive_ah3_metastore_sds_d from {color:#de350b}*Metastore*{color} partitions、tbls、sds
The sizes of the three tables are as follows:
80.3 M 240.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_partitions_d/pt=2021-10-09/exchangis_hive_w__2585cbd4_8bf8_4fbb_8a90_f5a7939b62b3.snappy
179.8 K 539.5 K hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_tbls_d/pt=2021-10-09/exchangis_hive_w__8a62acaa_6f82_442e_97db_ce960833612f.snappy
94.3 M 282.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_sds_d/pt=2021-10-09/exchangis_hive_w__d25536e8_7018_4262_a00e_5af3b1f88925.snappy
The result is as follows,select from pods.pods_pf_hive_ah3_metastore_sds_d table *{color:#de350b}is null{color}*
{code:java}
hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
OK
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER NULL NULL NULL NULL NULL NULL NULL NULL 20302818 1628697610 0 pt=2021-08-11 2021-08-14
{code}
The reality pods.pods_pf_hive_ah3_metastore_sds_d table is that the data in this table is {color:#de350b}*not null*{color}
{code:java}
> select * from pods.pods_pf_hive_ah3_metastore_sds_d where pt='2021-08-12' and sd_id=21229815;
OK
Interrupting...
Be patient, this might take some time.
Press Ctrl+C again to kill JVM
21229815 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 2021-08-12
{code}
When I change the order of join,The following SQL:
{code:java}
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name
from tmp1 a
left join tmp3 d on a.sd_id=d.sd_id
left join tmp2 b on a.tbl_id=b.tbl_id;
{code}
The result of this SQL execution is as follows,this result is true ,select from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl table is *{color:#de350b}not null{color}*
{code:java}
hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
OK
Interrupting...
Be patient, this might take some time.
Press Ctrl+C again to kill JVM
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 2021-08-14
{code}
The query result is *{color:#de350b}null{color}* first mapjoin,then left join.Query results that are not *{color:#de350b}null{color}* are first left join ,then mapjoin
I tested it again first left join,after mapjoin ,last left join .The result of the left join is *{color:#de350b}null{color}*
Test SQL as follows:
{code:java}
CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl`(
`tbl_id` bigint COMMENT 'TBL_ID',
`tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
`db_id` bigint COMMENT 'DB_ID',
`tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
`owner` string COMMENT 'OWNER',
`retention` bigint COMMENT 'RETENTION',
`sd_id` bigint COMMENT 'SD_ID',
`tbl_name` string COMMENT 'TBL_NAME',
`tbl_type` string COMMENT 'TBL_TYPE',
`view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
`view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
`is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
`tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
`cd_id` bigint COMMENT 'CD_ID',
`input_format` string COMMENT 'INPUT_FORMAT',
`is_compressed` bigint COMMENT 'IS_COMPRESSED',
`is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
`tbl_or_part_location` string COMMENT 'tbl_or_part_location',
`num_buckets` bigint COMMENT 'NUM_BUCKETS',
`output_format` string COMMENT 'OUTPUT_FORMAT',
`serde_id` bigint COMMENT 'SERDE_ID',
`part_id` bigint COMMENT 'PART_ID',
`part_create_time` bigint COMMENT 'PART_CREATE_TIME',
`part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
`part_name` string COMMENT 'PART_NAME',
`cd_id_1` bigint COMMENT 'CD_ID_1',
`input_format_1` string COMMENT 'INPUT_FORMAT_1',
`is_compressed_1` bigint COMMENT 'IS_COMPRESSED_1',
`is_storedassubdirectories_1` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES_1',
`tbl_or_part_location_1` string COMMENT 'tbl_or_part_location_1',
`num_buckets_1` bigint COMMENT 'NUM_BUCKETS_1',
`output_format_1` string COMMENT 'OUTPUT_FORMAT_1',
`serde_id_1` bigint COMMENT 'SERDE_ID_1'
)
PARTITIONED BY (
`pt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
with tmp1 as (
select
part_id,
create_time,
last_access_time,
part_name,
sd_id,
tbl_id
from
pods.pods_pf_hive_ah3_metastore_partitions_d
where
pt='2021-08-12'
),
tmp2 as (
select
tbl_id,
create_time,
db_id,
last_access_time,
owner,
retention,
sd_id,
tbl_name,
tbl_type,
view_expanded_text,
view_original_text,
is_rewrite_enabled,
owner_type
from
pods.pods_pf_hive_ah3_metastore_tbls_d
where
pt='2021-08-12'
),
tmp3 as (
select
sd_id,
cd_id,
input_format,
is_compressed,
is_storedassubdirectories,
location,
num_buckets,
output_format,
serde_id
from
pods.pods_pf_hive_ah3_metastore_sds_d
where
pt='2021-08-12'
)insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl PARTITION(pt='2021-08-14')
select
a.tbl_id,
b.create_time as tbl_create_time,
b.db_id,
b.last_access_time as tbl_last_access_time,
b.owner,
b.retention,
a.sd_id,
b.tbl_name,
b.tbl_type,
b.view_expanded_text,
b.view_original_text,
b.is_rewrite_enabled,
b.owner_type as tbl_owner_type,
d.cd_id,
d.input_format,
d.is_compressed,
d.is_storedassubdirectories,
d.location as tbl_location,
d.num_buckets,
d.output_format,
d.serde_id,
a.part_id,
a.create_time as part_create_time,
a.last_access_time as part_last_access_time,
a.part_name,
e.cd_id as cd_id_1,
e.input_format as input_format_1,
e.is_compressed as is_compressed_1,
e.is_storedassubdirectories as is_storedassubdirectories_1,
e.location as tbl_location_1,
e.num_buckets as num_buckets_1,
e.output_format as output_format_1,
e.serde_id as serde_id_1
from tmp1 a
left join tmp3 d on a.sd_id=d.sd_id
left join tmp2 b on a.tbl_id=b.tbl_id
left join tmp3 e on a.sd_id=e.sd_id
{code}
The result of this SQL execution is as follows:
{code:java}
> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl where pt='2021-08-14' and sd_id=21229815;
OK
721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 NULL NULL NULL NULL NULL NULL NULL NULL 2021-08-14
{code}
fields cd_id, input_format, is_compressed, is_storedassubdirectories, tbl_or_part_location, num_buckets, output_format, serde_id result {color:#de350b}*is not null*{color} ,but fields cd_id_1, input_format_1, is_compressed_1, is_storedassubdirectories_1, tbl_or_part_location_1, num_buckets_1, output_format_1, serde_id_1 result{color:#de350b} is null{color}
> Mapjoin then join left,the result is incorrect
> ----------------------------------------------
>
> Key: HIVE-25614
> URL: https://issues.apache.org/jira/browse/HIVE-25614
> Project: Hive
> Issue Type: Bug
> Components: Hive
> Affects Versions: 3.1.2
> Reporter: zengxl
> Priority: Critical
>
> Currently I join 3 tables, find the result of left join is *{color:#de350b}null{color}*
> Here is my SQL,The result of this SQL is NULL
> {code:java}
> //代码占位符
> CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl`(
> `tbl_id` bigint COMMENT 'TBL_ID',
> `tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
> `db_id` bigint COMMENT 'DB_ID',
> `tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
> `owner` string COMMENT 'OWNER',
> `retention` bigint COMMENT 'RETENTION',
> `sd_id` bigint COMMENT 'SD_ID',
> `tbl_name` string COMMENT 'TBL_NAME',
> `tbl_type` string COMMENT 'TBL_TYPE',
> `view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
> `view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
> `is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
> `tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
> `cd_id` bigint COMMENT 'CD_ID',
> `input_format` string COMMENT 'INPUT_FORMAT',
> `is_compressed` bigint COMMENT 'IS_COMPRESSED',
> `is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
> `tbl_or_part_location` string COMMENT 'tbl_or_part_location',
> `num_buckets` bigint COMMENT 'NUM_BUCKETS',
> `output_format` string COMMENT 'OUTPUT_FORMAT',
> `serde_id` bigint COMMENT 'SERDE_ID',
> `part_id` bigint COMMENT 'PART_ID',
> `part_create_time` bigint COMMENT 'PART_CREATE_TIME',
> `part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
> `part_name` string COMMENT 'PART_NAME')
> PARTITIONED BY (
> `pt` string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
>
> with tmp1 as (
> select
> part_id,
> create_time,
> last_access_time,
> part_name,
> sd_id,
> tbl_id
> from
> pods.pods_pf_hive_ah3_metastore_partitions_d
> where
> pt='2021-08-12'
> ),
> tmp2 as (
> select
> tbl_id,
> create_time,
> db_id,
> last_access_time,
> owner,
> retention,
> sd_id,
> tbl_name,
> tbl_type,
> view_expanded_text,
> view_original_text,
> is_rewrite_enabled,
> owner_type
> from
> pods.pods_pf_hive_ah3_metastore_tbls_d
> where
> pt='2021-08-12'
> ),
> tmp3 as (
> select
> sd_id,
> cd_id,
> input_format,
> is_compressed,
> is_storedassubdirectories,
> location,
> num_buckets,
> output_format,
> serde_id
> from
> pods.pods_pf_hive_ah3_metastore_sds_d
> where
> pt='2021-08-12'
> )insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
> select
> a.tbl_id,
> b.create_time as tbl_create_time,
> b.db_id,
> b.last_access_time as tbl_last_access_time,
> b.owner,
> b.retention,
> a.sd_id,
> b.tbl_name,
> b.tbl_type,
> b.view_expanded_text,
> b.view_original_text,
> b.is_rewrite_enabled,
> b.owner_type as tbl_owner_type,
> d.cd_id,
> d.input_format,
> d.is_compressed,
> d.is_storedassubdirectories,
> d.location as tbl_location,
> d.num_buckets,
> d.output_format,
> d.serde_id,
> a.part_id,
> a.create_time as part_create_time,
> a.last_access_time as part_last_access_time,
> a.part_name
> from tmp1 a
> left join tmp2 b on a.tbl_id=b.tbl_id
> left join tmp3 d on a.sd_id=d.sd_id;
> {code}
> pods.pods_pf_hive_ah3_metastore_partitions_d、pods.pods_pf_hive_ah3_metastore_tbls_d、pods.pods_pf_hive_ah3_metastore_sds_d from {color:#de350b}*Metastore*{color} partitions、tbls、sds
> The sizes of the three tables are as follows:
> 80.3 M 240.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_partitions_d/pt=2021-10-09/exchangis_hive_w__2585cbd4_8bf8_4fbb_8a90_f5a7939b62b3.snappy
> 179.8 K 539.5 K hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_tbls_d/pt=2021-10-09/exchangis_hive_w__8a62acaa_6f82_442e_97db_ce960833612f.snappy
> 94.3 M 282.9 M hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_hive_ah3_metastore_sds_d/pt=2021-10-09/exchangis_hive_w__d25536e8_7018_4262_a00e_5af3b1f88925.snappy
> The result is as follows,select from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl table *{color:#de350b}is null{color}*
> {code:java}
> hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
> OK
> 721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER NULL NULL NULL NULL NULL NULL NULL NULL 20302818 1628697610 0 pt=2021-08-11 2021-08-14
> {code}
> The reality pods.pods_pf_hive_ah3_metastore_sds_d table is that the data in this table is {color:#de350b}*not null*{color}
> {code:java}
> > select * from pods.pods_pf_hive_ah3_metastore_sds_d where pt='2021-08-12' and sd_id=21229815;
> OK
> Interrupting...
> Be patient, this might take some time.
> Press Ctrl+C again to kill JVM
> 21229815 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 2021-08-12
> {code}
> When I change the order of join,The following SQL:
> {code:java}
> with tmp1 as (
> select
> part_id,
> create_time,
> last_access_time,
> part_name,
> sd_id,
> tbl_id
> from
> pods.pods_pf_hive_ah3_metastore_partitions_d
> where
> pt='2021-08-12'
> ),
> tmp2 as (
> select
> tbl_id,
> create_time,
> db_id,
> last_access_time,
> owner,
> retention,
> sd_id,
> tbl_name,
> tbl_type,
> view_expanded_text,
> view_original_text,
> is_rewrite_enabled,
> owner_type
> from
> pods.pods_pf_hive_ah3_metastore_tbls_d
> where
> pt='2021-08-12'
> ),
> tmp3 as (
> select
> sd_id,
> cd_id,
> input_format,
> is_compressed,
> is_storedassubdirectories,
> location,
> num_buckets,
> output_format,
> serde_id
> from
> pods.pods_pf_hive_ah3_metastore_sds_d
> where
> pt='2021-08-12'
> )insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl PARTITION(pt='2021-08-14')
> select
> a.tbl_id,
> b.create_time as tbl_create_time,
> b.db_id,
> b.last_access_time as tbl_last_access_time,
> b.owner,
> b.retention,
> a.sd_id,
> b.tbl_name,
> b.tbl_type,
> b.view_expanded_text,
> b.view_original_text,
> b.is_rewrite_enabled,
> b.owner_type as tbl_owner_type,
> d.cd_id,
> d.input_format,
> d.is_compressed,
> d.is_storedassubdirectories,
> d.location as tbl_location,
> d.num_buckets,
> d.output_format,
> d.serde_id,
> a.part_id,
> a.create_time as part_create_time,
> a.last_access_time as part_last_access_time,
> a.part_name
> from tmp1 a
> left join tmp3 d on a.sd_id=d.sd_id
> left join tmp2 b on a.tbl_id=b.tbl_id;
> {code}
> The result of this SQL execution is as follows,this result is true ,select from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl table is *{color:#de350b}not null{color}*
>
> {code:java}
> hive> select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_3_tbl where pt='2021-08-14' and sd_id=21229815;
> OK
> Interrupting...
> Be patient, this might take some time.
> Press Ctrl+C again to kill JVM
> 721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 2021-08-14
> {code}
> The query result is *{color:#de350b}null{color}* first mapjoin,then left join.Query results that are not *{color:#de350b}null{color}* are first left join ,then mapjoin
> I tested it again first left join,after mapjoin ,last left join .The result of the left join is *{color:#de350b}null{color}*
> Test SQL as follows:
> {code:java}
> CREATE TABLE `pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl`(
> `tbl_id` bigint COMMENT 'TBL_ID',
> `tbl_create_time` bigint COMMENT 'TBL_CREATE_TIME',
> `db_id` bigint COMMENT 'DB_ID',
> `tbl_last_access_time` bigint COMMENT 'TBL_LAST_ACCESS_TIME',
> `owner` string COMMENT 'OWNER',
> `retention` bigint COMMENT 'RETENTION',
> `sd_id` bigint COMMENT 'SD_ID',
> `tbl_name` string COMMENT 'TBL_NAME',
> `tbl_type` string COMMENT 'TBL_TYPE',
> `view_expanded_text` string COMMENT 'VIEW_EXPANDED_TEXT',
> `view_original_text` string COMMENT 'VIEW_ORIGINAL_TEXT',
> `is_rewrite_enabled` bigint COMMENT 'IS_REWRITE_ENABLED',
> `tbl_owner_type` string COMMENT 'TBL_OWNER_TYPE',
> `cd_id` bigint COMMENT 'CD_ID',
> `input_format` string COMMENT 'INPUT_FORMAT',
> `is_compressed` bigint COMMENT 'IS_COMPRESSED',
> `is_storedassubdirectories` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES',
> `tbl_or_part_location` string COMMENT 'tbl_or_part_location',
> `num_buckets` bigint COMMENT 'NUM_BUCKETS',
> `output_format` string COMMENT 'OUTPUT_FORMAT',
> `serde_id` bigint COMMENT 'SERDE_ID',
> `part_id` bigint COMMENT 'PART_ID',
> `part_create_time` bigint COMMENT 'PART_CREATE_TIME',
> `part_last_access_time` bigint COMMENT 'PART_LAST_ACCESS_TIME',
> `part_name` string COMMENT 'PART_NAME',
> `cd_id_1` bigint COMMENT 'CD_ID_1',
> `input_format_1` string COMMENT 'INPUT_FORMAT_1',
> `is_compressed_1` bigint COMMENT 'IS_COMPRESSED_1',
> `is_storedassubdirectories_1` bigint COMMENT 'IS_STOREDASSUBDIRECTORIES_1',
> `tbl_or_part_location_1` string COMMENT 'tbl_or_part_location_1',
> `num_buckets_1` bigint COMMENT 'NUM_BUCKETS_1',
> `output_format_1` string COMMENT 'OUTPUT_FORMAT_1',
> `serde_id_1` bigint COMMENT 'SERDE_ID_1'
> )
> PARTITIONED BY (
> `pt` string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat';
>
> with tmp1 as (
> select
> part_id,
> create_time,
> last_access_time,
> part_name,
> sd_id,
> tbl_id
> from
> pods.pods_pf_hive_ah3_metastore_partitions_d
> where
> pt='2021-08-12'
> ),
> tmp2 as (
> select
> tbl_id,
> create_time,
> db_id,
> last_access_time,
> owner,
> retention,
> sd_id,
> tbl_name,
> tbl_type,
> view_expanded_text,
> view_original_text,
> is_rewrite_enabled,
> owner_type
> from
> pods.pods_pf_hive_ah3_metastore_tbls_d
> where
> pt='2021-08-12'
> ),
> tmp3 as (
> select
> sd_id,
> cd_id,
> input_format,
> is_compressed,
> is_storedassubdirectories,
> location,
> num_buckets,
> output_format,
> serde_id
> from
> pods.pods_pf_hive_ah3_metastore_sds_d
> where
> pt='2021-08-12'
> )insert overwrite table pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl PARTITION(pt='2021-08-14')
> select
> a.tbl_id,
> b.create_time as tbl_create_time,
> b.db_id,
> b.last_access_time as tbl_last_access_time,
> b.owner,
> b.retention,
> a.sd_id,
> b.tbl_name,
> b.tbl_type,
> b.view_expanded_text,
> b.view_original_text,
> b.is_rewrite_enabled,
> b.owner_type as tbl_owner_type,
> d.cd_id,
> d.input_format,
> d.is_compressed,
> d.is_storedassubdirectories,
> d.location as tbl_location,
> d.num_buckets,
> d.output_format,
> d.serde_id,
> a.part_id,
> a.create_time as part_create_time,
> a.last_access_time as part_last_access_time,
> a.part_name,
> e.cd_id as cd_id_1,
> e.input_format as input_format_1,
> e.is_compressed as is_compressed_1,
> e.is_storedassubdirectories as is_storedassubdirectories_1,
> e.location as tbl_location_1,
> e.num_buckets as num_buckets_1,
> e.output_format as output_format_1,
> e.serde_id as serde_id_1
> from tmp1 a
> left join tmp3 d on a.sd_id=d.sd_id
> left join tmp2 b on a.tbl_id=b.tbl_id
> left join tmp3 e on a.sd_id=e.sd_id
> {code}
> The result of this SQL execution is as follows:
> {code:java}
> > select * from pdwd.pdwd_pf_hive_ah3_metastore_tbl_partitions_d_test_0831_4_tbl where pt='2021-08-14' and sd_id=21229815;
> OK
> 721213 1619403652 528181 0 op 0 21229815 pods_pf_yarn_logic_res_i_d MANAGED_TABLE NULL NULL 0 USER 726105 org.apache.hadoop.hive.ql.io.orc.OrcInputFormat 0 0 hdfs://ctyunns4/apps/hive/warehouse/pods/pods_pf_yarn_logic_res_i_d/pt=2021-08-11 -1 org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat 21229815 20302818 1628697610 0 pt=2021-08-11 NULL NULL NULL NULL NULL NULL NULL NULL 2021-08-14
> {code}
> fields cd_id, input_format, is_compressed, is_storedassubdirectories, tbl_or_part_location, num_buckets, output_format, serde_id result {color:#de350b}*is not null*{color} ,but fields cd_id_1, input_format_1, is_compressed_1, is_storedassubdirectories_1, tbl_or_part_location_1, num_buckets_1, output_format_1, serde_id_1 result{color:#de350b} is null{color}
>
>
>
>
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)