You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "gumengchao (Jira)" <ji...@apache.org> on 2020/04/29 04:08:00 UTC

[jira] [Updated] (HIVE-23304) Two SQL execution results with the same semantics are not the same

     [ https://issues.apache.org/jira/browse/HIVE-23304?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

gumengchao updated HIVE-23304:
------------------------------
    Description: 
The simplified question is  

 

execute sql

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a
	INNER JOIN (
		SELECT a.movieid AS movieid, a.userid AS userid, a.rate
		FROM t_rating a
			JOIN (
				SELECT COUNT(*) AS rate_times, a.userid AS userid
				FROM t_rating a
					JOIN (
						SELECT *
						FROM t_user
						WHERE sex = 'F'
					) b
					ON a.userid = b.userid
				GROUP BY a.userid
				ORDER BY rate_times DESC
				LIMIT 0, 1
			) b
			ON a.userid = b.userid
		ORDER BY a.rate DESC
		LIMIT 0, 10
	) b
	ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
The result is below.But this is not the result I want 
{noformat}
4.063136456211812	162	1150
4.476190476190476	904	1150
4.249370277078086	951	1150
4.14167916041979	1230	1150
3.6464646464646466	1966	1150
4.163043478260869	2330	1150
3.7039473684210527	3163	1150
4.387453874538745	3307	1150
4.047363717605005	3671	1150
3.8265682656826567	3675	1150{noformat}
So I divided sql into two steps

fisrt: create temporary table with subquery

 
{noformat}
create temporary table tmp as
SELECT a.movieid AS movieid, a.userid AS userid, a.rate
		FROM t_rating a
			JOIN (
				SELECT COUNT(*) AS rate_times, a.userid AS userid
				FROM t_rating a
					JOIN (
						SELECT *
						FROM t_user
						WHERE sex = 'F'
					) b
					ON a.userid = b.userid
				GROUP BY a.userid
				ORDER BY rate_times DESC
				LIMIT 0, 1
			) b
			ON a.userid = b.userid
		ORDER BY a.rate DESC
		LIMIT 0, 10{noformat}
second: use temporary table replace subquery

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a INNER JOIN tmp b
	ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
the result 
{noformat}
4.52054794520548	745	1150
4.4498902706656915	750	1150
4.476190476190476	904	1150
4.280748663101604	905	1150
3.7314890154597236	1094	1150
4.188888888888889	1236	1150
4.21043771043771	1256	1150
3.747422680412371	1279	1150
4.0739348370927315	2064	1150
4.125390450691656	2997	1150{noformat}
Why the results of two executions are different?

 

 

 

 

 

 

  was:
execute sql

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a
	INNER JOIN (
		SELECT a.movieid AS movieid, a.userid AS userid, a.rate
		FROM t_rating a
			JOIN (
				SELECT COUNT(*) AS rate_times, a.userid AS userid
				FROM t_rating a
					JOIN (
						SELECT *
						FROM t_user
						WHERE sex = 'F'
					) b
					ON a.userid = b.userid
				GROUP BY a.userid
				ORDER BY rate_times DESC
				LIMIT 0, 1
			) b
			ON a.userid = b.userid
		ORDER BY a.rate DESC
		LIMIT 0, 10
	) b
	ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
The result is below.But this is not the result I want 
{noformat}
4.063136456211812	162	1150
4.476190476190476	904	1150
4.249370277078086	951	1150
4.14167916041979	1230	1150
3.6464646464646466	1966	1150
4.163043478260869	2330	1150
3.7039473684210527	3163	1150
4.387453874538745	3307	1150
4.047363717605005	3671	1150
3.8265682656826567	3675	1150{noformat}
So I divided sql into two steps

fisrt:

 
{noformat}
create temporary table tmp as
SELECT a.movieid AS movieid, a.userid AS userid, a.rate
		FROM t_rating a
			JOIN (
				SELECT COUNT(*) AS rate_times, a.userid AS userid
				FROM t_rating a
					JOIN (
						SELECT *
						FROM t_user
						WHERE sex = 'F'
					) b
					ON a.userid = b.userid
				GROUP BY a.userid
				ORDER BY rate_times DESC
				LIMIT 0, 1
			) b
			ON a.userid = b.userid
		ORDER BY a.rate DESC
		LIMIT 0, 10{noformat}
second:

 
{noformat}
SELECT AVG(a.rate), a.movieid, MAX(b.userid)
FROM t_rating a INNER JOIN tmp b
	ON a.movieid = b.movieid
GROUP BY a.movieid{noformat}
the result 
{noformat}
4.52054794520548	745	1150
4.4498902706656915	750	1150
4.476190476190476	904	1150
4.280748663101604	905	1150
3.7314890154597236	1094	1150
4.188888888888889	1236	1150
4.21043771043771	1256	1150
3.747422680412371	1279	1150
4.0739348370927315	2064	1150
4.125390450691656	2997	1150{noformat}
Why the results of two executions are different?

 

 

 

 

 

 


> Two SQL execution results with the same semantics are not the same
> ------------------------------------------------------------------
>
>                 Key: HIVE-23304
>                 URL: https://issues.apache.org/jira/browse/HIVE-23304
>             Project: Hive
>          Issue Type: Bug
>          Components: Hive, SQL
>    Affects Versions: 2.3.7
>         Environment: hadoop version 2.7.7
> hive version 2.3.7
> hive.execution.engine=mr
>  
>            Reporter: gumengchao
>            Priority: Critical
>
> The simplified question is  
>  
> execute sql
>  
> {noformat}
> SELECT AVG(a.rate), a.movieid, MAX(b.userid)
> FROM t_rating a
> 	INNER JOIN (
> 		SELECT a.movieid AS movieid, a.userid AS userid, a.rate
> 		FROM t_rating a
> 			JOIN (
> 				SELECT COUNT(*) AS rate_times, a.userid AS userid
> 				FROM t_rating a
> 					JOIN (
> 						SELECT *
> 						FROM t_user
> 						WHERE sex = 'F'
> 					) b
> 					ON a.userid = b.userid
> 				GROUP BY a.userid
> 				ORDER BY rate_times DESC
> 				LIMIT 0, 1
> 			) b
> 			ON a.userid = b.userid
> 		ORDER BY a.rate DESC
> 		LIMIT 0, 10
> 	) b
> 	ON a.movieid = b.movieid
> GROUP BY a.movieid{noformat}
> The result is below.But this is not the result I want 
> {noformat}
> 4.063136456211812	162	1150
> 4.476190476190476	904	1150
> 4.249370277078086	951	1150
> 4.14167916041979	1230	1150
> 3.6464646464646466	1966	1150
> 4.163043478260869	2330	1150
> 3.7039473684210527	3163	1150
> 4.387453874538745	3307	1150
> 4.047363717605005	3671	1150
> 3.8265682656826567	3675	1150{noformat}
> So I divided sql into two steps
> fisrt: create temporary table with subquery
>  
> {noformat}
> create temporary table tmp as
> SELECT a.movieid AS movieid, a.userid AS userid, a.rate
> 		FROM t_rating a
> 			JOIN (
> 				SELECT COUNT(*) AS rate_times, a.userid AS userid
> 				FROM t_rating a
> 					JOIN (
> 						SELECT *
> 						FROM t_user
> 						WHERE sex = 'F'
> 					) b
> 					ON a.userid = b.userid
> 				GROUP BY a.userid
> 				ORDER BY rate_times DESC
> 				LIMIT 0, 1
> 			) b
> 			ON a.userid = b.userid
> 		ORDER BY a.rate DESC
> 		LIMIT 0, 10{noformat}
> second: use temporary table replace subquery
>  
> {noformat}
> SELECT AVG(a.rate), a.movieid, MAX(b.userid)
> FROM t_rating a INNER JOIN tmp b
> 	ON a.movieid = b.movieid
> GROUP BY a.movieid{noformat}
> the result 
> {noformat}
> 4.52054794520548	745	1150
> 4.4498902706656915	750	1150
> 4.476190476190476	904	1150
> 4.280748663101604	905	1150
> 3.7314890154597236	1094	1150
> 4.188888888888889	1236	1150
> 4.21043771043771	1256	1150
> 3.747422680412371	1279	1150
> 4.0739348370927315	2064	1150
> 4.125390450691656	2997	1150{noformat}
> Why the results of two executions are different?
>  
>  
>  
>  
>  
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)