You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "Zoltan Haindrich (JIRA)" <ji...@apache.org> on 2018/05/16 12:24:00 UTC

[jira] [Updated] (HIVE-19570) Multiple inserts using "Group by" generates incorrect results

     [ https://issues.apache.org/jira/browse/HIVE-19570?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Zoltan Haindrich updated HIVE-19570:
------------------------------------
    Description: 
Repro steps:

{code}
drop database if exists ax1 cascade;
create database ax1;
use ax1;

CREATE TABLE 
	tmp1 ( 
		v1 string , v2 string , v3 string ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

INSERT INTO tmp1
VALUES 
('a', 'b', 'c1') 
, ('a', 'b', 'c2') 
, ('d', 'e', 'f') 
, ('g', 'h', 'i') 
;

CREATE TABLE 
tmp_grouped_by_one_col  ( v1 string , cnt__v2 int , cnt__v3 int ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

CREATE TABLE 
tmp_grouped_by_two_col ( v1 string , v2 string , cnt__v3 int ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

CREATE TABLE 
tmp_grouped_by_all_col ( v1 string , v2 string , v3 string ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

FROM tmp1
INSERT INTO tmp_grouped_by_one_col 
SELECT v1, count(distinct v2), count(distinct v3) 
GROUP BY v1
INSERT INTO tmp_grouped_by_all_col 
SELECT v1, v2, v3
GROUP BY v1, v2, v3
;

select 'tmp_grouped_by_one_col',count(*) from tmp_grouped_by_one_col
union all
select 'tmp_grouped_by_two_col',count(*) from tmp_grouped_by_two_col
union all
select 'tmp_grouped_by_all_col',count(*) from tmp_grouped_by_all_col;

select * from tmp_grouped_by_all_col;
{code}

tmp_grouped_by_all_col table should have 4 reocrds but it loads 7 records into the table.

{code}
+----------------------------+----------------------------+----------------------------+--+
| tmp_grouped_by_all_col.v1  | tmp_grouped_by_all_col.v2  | tmp_grouped_by_all_col.v3  |
+----------------------------+----------------------------+----------------------------+--+
| a                          | b                          | b                          |
| a                          | c1                         | c1                         |
| a                          | c2                         | c2                         |
| d                          | e                          | e                          |
| d                          | f                          | f                          |
| g                          | h                          | h                          |
| g                          | i                          | i                          |
+----------------------------+----------------------------+----------------------------+--+
{code}



  was:
Repro steps:

drop database if exists ax1 cascade;
create database ax1;
use ax1;

CREATE TABLE 
	tmp1 ( 
		v1 string , v2 string , v3 string ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

INSERT INTO tmp1
VALUES 
('a', 'b', 'c1') 
, ('a', 'b', 'c2') 
, ('d', 'e', 'f') 
, ('g', 'h', 'i') 
;

CREATE TABLE 
tmp_grouped_by_one_col  ( v1 string , cnt__v2 int , cnt__v3 int ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

CREATE TABLE 
tmp_grouped_by_two_col ( v1 string , v2 string , cnt__v3 int ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

CREATE TABLE 
tmp_grouped_by_all_col ( v1 string , v2 string , v3 string ) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '\t' 
LINES TERMINATED BY '\n' 
;

FROM tmp1
INSERT INTO tmp_grouped_by_one_col 
SELECT v1, count(distinct v2), count(distinct v3) 
GROUP BY v1
INSERT INTO tmp_grouped_by_all_col 
SELECT v1, v2, v3
GROUP BY v1, v2, v3
;

select 'tmp_grouped_by_one_col',count(*) from tmp_grouped_by_one_col
union all
select 'tmp_grouped_by_two_col',count(*) from tmp_grouped_by_two_col
union all
select 'tmp_grouped_by_all_col',count(*) from tmp_grouped_by_all_col;

select * from tmp_grouped_by_all_col;

tmp_grouped_by_all_col table should have 4 reocrds but it loads 7 records into the table.

+----------------------------+----------------------------+----------------------------+--+
| tmp_grouped_by_all_col.v1  | tmp_grouped_by_all_col.v2  | tmp_grouped_by_all_col.v3  |
+----------------------------+----------------------------+----------------------------+--+
| a                          | b                          | b                          |
| a                          | c1                         | c1                         |
| a                          | c2                         | c2                         |
| d                          | e                          | e                          |
| d                          | f                          | f                          |
| g                          | h                          | h                          |
| g                          | i                          | i                          |
+----------------------------+----------------------------+----------------------------+--+





> Multiple inserts using "Group by" generates incorrect results
> -------------------------------------------------------------
>
>                 Key: HIVE-19570
>                 URL: https://issues.apache.org/jira/browse/HIVE-19570
>             Project: Hive
>          Issue Type: Bug
>          Components: Logical Optimizer, Query Processor
>    Affects Versions: 1.2.0, 3.0.0
>            Reporter: Riju Trivedi
>            Priority: Critical
>
> Repro steps:
> {code}
> drop database if exists ax1 cascade;
> create database ax1;
> use ax1;
> CREATE TABLE 
> 	tmp1 ( 
> 		v1 string , v2 string , v3 string ) 
> ROW FORMAT DELIMITED 
> FIELDS TERMINATED BY '\t' 
> LINES TERMINATED BY '\n' 
> ;
> INSERT INTO tmp1
> VALUES 
> ('a', 'b', 'c1') 
> , ('a', 'b', 'c2') 
> , ('d', 'e', 'f') 
> , ('g', 'h', 'i') 
> ;
> CREATE TABLE 
> tmp_grouped_by_one_col  ( v1 string , cnt__v2 int , cnt__v3 int ) 
> ROW FORMAT DELIMITED 
> FIELDS TERMINATED BY '\t' 
> LINES TERMINATED BY '\n' 
> ;
> CREATE TABLE 
> tmp_grouped_by_two_col ( v1 string , v2 string , cnt__v3 int ) 
> ROW FORMAT DELIMITED 
> FIELDS TERMINATED BY '\t' 
> LINES TERMINATED BY '\n' 
> ;
> CREATE TABLE 
> tmp_grouped_by_all_col ( v1 string , v2 string , v3 string ) 
> ROW FORMAT DELIMITED 
> FIELDS TERMINATED BY '\t' 
> LINES TERMINATED BY '\n' 
> ;
> FROM tmp1
> INSERT INTO tmp_grouped_by_one_col 
> SELECT v1, count(distinct v2), count(distinct v3) 
> GROUP BY v1
> INSERT INTO tmp_grouped_by_all_col 
> SELECT v1, v2, v3
> GROUP BY v1, v2, v3
> ;
> select 'tmp_grouped_by_one_col',count(*) from tmp_grouped_by_one_col
> union all
> select 'tmp_grouped_by_two_col',count(*) from tmp_grouped_by_two_col
> union all
> select 'tmp_grouped_by_all_col',count(*) from tmp_grouped_by_all_col;
> select * from tmp_grouped_by_all_col;
> {code}
> tmp_grouped_by_all_col table should have 4 reocrds but it loads 7 records into the table.
> {code}
> +----------------------------+----------------------------+----------------------------+--+
> | tmp_grouped_by_all_col.v1  | tmp_grouped_by_all_col.v2  | tmp_grouped_by_all_col.v3  |
> +----------------------------+----------------------------+----------------------------+--+
> | a                          | b                          | b                          |
> | a                          | c1                         | c1                         |
> | a                          | c2                         | c2                         |
> | d                          | e                          | e                          |
> | d                          | f                          | f                          |
> | g                          | h                          | h                          |
> | g                          | i                          | i                          |
> +----------------------------+----------------------------+----------------------------+--+
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)