You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "Illya Yalovyy (JIRA)" <ji...@apache.org> on 2015/06/11 17:39:00 UTC
[jira] [Updated] (HIVE-10980) Merge of dynamic partitions loads all data to default partition
[ https://issues.apache.org/jira/browse/HIVE-10980?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Illya Yalovyy updated HIVE-10980:
---------------------------------
Description:
Conditions that lead to the issue:
1. Execution engine set to MapReduce
2. Partition columns have different types
3. Both static and dynamic partitions are used in the query
4. Dynamically generated partitions require merge
Result: Final data is loaded to "__HIVE_DEFAULT_PARTITION__".
Steps to reproduce:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=strict;
set hive.optimize.sort.dynamic.partition=false;
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.execution.engine=mr;
create external table sdp (
dataint bigint,
hour int,
req string,
cid string,
caid string
)
row format delimited
fields terminated by ',';
load data local inpath '../../data/files/dynpartdata1.txt' into table sdp;
load data local inpath '../../data/files/dynpartdata2.txt' into table sdp;
...
load data local inpath '../../data/files/dynpartdataN.txt' into table sdp;
create table tdp (cid string, caid string)
partitioned by (dataint bigint, hour int, req string);
insert overwrite table tdp partition (dataint=20150316, hour=16, req)
select cid, caid, req from sdp where dataint=20150316 and hour=16;
select * from tdp order by caid;
show partitions tdp;
Example of the input file:
20150316,16,reqA,clusterIdA,cacheId1
20150316,16,reqB,clusterIdB,cacheId2
20150316,16,reqA,clusterIdC,cacheId3
20150316,16,reqD,clusterIdD,cacheId4
20150316,16,reqA,clusterIdA,cacheId5
Actual result:
clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdB cacheId2 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdC cacheId3 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdD cacheId4 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdA cacheId5 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdD cacheId8 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdB cacheId9 20150316 16 __HIVE_DEFAULT_PARTITION__
dataint=20150316/hour=16/req=__HIVE_DEFAULT_PARTITION__
was:
Conditions that lead to the issue:
1. Partition columns have different types
2. Both static and dynamic partitions are used in the query
3. Dynamically generated partitions require merge
Result: Final data is loaded to "__HIVE_DEFAULT_PARTITION__".
Steps to reproduce:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=strict;
set hive.optimize.sort.dynamic.partition=false;
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
create external table sdp (
dataint bigint,
hour int,
req string,
cid string,
caid string
)
row format delimited
fields terminated by ',';
load data local inpath '../../data/files/dynpartdata1.txt' into table sdp;
load data local inpath '../../data/files/dynpartdata2.txt' into table sdp;
...
load data local inpath '../../data/files/dynpartdataN.txt' into table sdp;
create table tdp (cid string, caid string)
partitioned by (dataint bigint, hour int, req string);
insert overwrite table tdp partition (dataint=20150316, hour=16, req)
select cid, caid, req from sdp where dataint=20150316 and hour=16;
select * from tdp order by caid;
show partitions tdp;
Example of the input file:
20150316,16,reqA,clusterIdA,cacheId1
20150316,16,reqB,clusterIdB,cacheId2
20150316,16,reqA,clusterIdC,cacheId3
20150316,16,reqD,clusterIdD,cacheId4
20150316,16,reqA,clusterIdA,cacheId5
Actual result:
clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdB cacheId2 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdC cacheId3 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdD cacheId4 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdA cacheId5 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdD cacheId8 20150316 16 __HIVE_DEFAULT_PARTITION__
clusterIdB cacheId9 20150316 16 __HIVE_DEFAULT_PARTITION__
dataint=20150316/hour=16/req=__HIVE_DEFAULT_PARTITION__
> Merge of dynamic partitions loads all data to default partition
> ---------------------------------------------------------------
>
> Key: HIVE-10980
> URL: https://issues.apache.org/jira/browse/HIVE-10980
> Project: Hive
> Issue Type: Bug
> Components: Hive
> Affects Versions: 0.14.0
> Environment: HDP 2.2.4 (also reproduced on apache hive built from trunk)
> Reporter: Illya Yalovyy
>
> Conditions that lead to the issue:
> 1. Execution engine set to MapReduce
> 2. Partition columns have different types
> 3. Both static and dynamic partitions are used in the query
> 4. Dynamically generated partitions require merge
> Result: Final data is loaded to "__HIVE_DEFAULT_PARTITION__".
> Steps to reproduce:
> set hive.exec.dynamic.partition=true;
> set hive.exec.dynamic.partition.mode=strict;
> set hive.optimize.sort.dynamic.partition=false;
> set hive.merge.mapfiles=true;
> set hive.merge.mapredfiles=true;
> set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
> set hive.execution.engine=mr;
> create external table sdp (
> dataint bigint,
> hour int,
> req string,
> cid string,
> caid string
> )
> row format delimited
> fields terminated by ',';
> load data local inpath '../../data/files/dynpartdata1.txt' into table sdp;
> load data local inpath '../../data/files/dynpartdata2.txt' into table sdp;
> ...
> load data local inpath '../../data/files/dynpartdataN.txt' into table sdp;
> create table tdp (cid string, caid string)
> partitioned by (dataint bigint, hour int, req string);
> insert overwrite table tdp partition (dataint=20150316, hour=16, req)
> select cid, caid, req from sdp where dataint=20150316 and hour=16;
> select * from tdp order by caid;
> show partitions tdp;
> Example of the input file:
> 20150316,16,reqA,clusterIdA,cacheId1
> 20150316,16,reqB,clusterIdB,cacheId2
> 20150316,16,reqA,clusterIdC,cacheId3
> 20150316,16,reqD,clusterIdD,cacheId4
> 20150316,16,reqA,clusterIdA,cacheId5
> Actual result:
> clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdA cacheId1 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdB cacheId2 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdC cacheId3 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdD cacheId4 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdA cacheId5 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdD cacheId8 20150316 16 __HIVE_DEFAULT_PARTITION__
> clusterIdB cacheId9 20150316 16 __HIVE_DEFAULT_PARTITION__
> dataint=20150316/hour=16/req=__HIVE_DEFAULT_PARTITION__
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)