You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yong Zhang (JIRA)" <ji...@apache.org> on 2017/03/29 01:35:41 UTC
[jira] [Comment Edited] (SPARK-20093) Exception when Joining
dataframe with another dataframe generated by applying groupBy
transformation on original one
[ https://issues.apache.org/jira/browse/SPARK-20093?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15946375#comment-15946375 ]
Yong Zhang edited comment on SPARK-20093 at 3/29/17 1:34 AM:
-------------------------------------------------------------
This problem exists. It looks like switching the order of the join makes it work fine.
scala> spark.version
res16: String = 2.1.0
scala> groupDF.join(df, groupDF("height") === df("height")).show
org.apache.spark.sql.AnalysisException: resolved attribute(s) height#8 missing from height#181,gender#7,height#338,weight#339,gender#337 in operator !Join Inner, (height#181 = height#8);;
!Join Inner, (height#181 = height#8)
:- Aggregate [gender#7], [gender#7, min(height#8) AS height#181]
: +- Project [_1#3 AS gender#7, _2#4 AS height#8, _3#5 AS weight#9]
: +- LocalRelation [_1#3, _2#4, _3#5]
+- Project [_1#3 AS gender#337, _2#4 AS height#338, _3#5 AS weight#339]
+- LocalRelation [_1#3, _2#4, _3#5]
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:40)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:57)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:337)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:67)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:128)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:67)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:57)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:48)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:63)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2822)
at org.apache.spark.sql.Dataset.join(Dataset.scala:830)
at org.apache.spark.sql.Dataset.join(Dataset.scala:796)
... 48 elided
scala> df.join(groupDF, groupDF("height") === df("height")).show
+------+------+------+------+------+
|gender|height|weight|gender|height|
+------+------+------+------+------+
| M| 160| 55| M| 160|
| F| 150| 53| F| 150|
+------+------+------+------+------+
was (Author: java8964):
This problem exists. It looks like if switch the order of join, then it works fine.
scala> spark.version
res16: String = 2.1.0
scala> groupDF.join(df, groupDF("height") === df("height")).show
org.apache.spark.sql.AnalysisException: resolved attribute(s) height#8 missing from height#181,gender#7,height#338,weight#339,gender#337 in operator !Join Inner, (height#181 = height#8);;
!Join Inner, (height#181 = height#8)
:- Aggregate [gender#7], [gender#7, min(height#8) AS height#181]
: +- Project [_1#3 AS gender#7, _2#4 AS height#8, _3#5 AS weight#9]
: +- LocalRelation [_1#3, _2#4, _3#5]
+- Project [_1#3 AS gender#337, _2#4 AS height#338, _3#5 AS weight#339]
+- LocalRelation [_1#3, _2#4, _3#5]
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:40)
at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:57)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:337)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:67)
at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:128)
at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:67)
at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:57)
at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:48)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:63)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2822)
at org.apache.spark.sql.Dataset.join(Dataset.scala:830)
at org.apache.spark.sql.Dataset.join(Dataset.scala:796)
... 48 elided
scala> df.join(groupDF, groupDF("height") === df("height")).show
+------+------+------+------+------+
|gender|height|weight|gender|height|
+------+------+------+------+------+
| M| 160| 55| M| 160|
| F| 150| 53| F| 150|
+------+------+------+------+------+
> Exception when Joining dataframe with another dataframe generated by applying groupBy transformation on original one
> --------------------------------------------------------------------------------------------------------------------
>
> Key: SPARK-20093
> URL: https://issues.apache.org/jira/browse/SPARK-20093
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.0.0, 2.0.1, 2.0.2, 2.1.0, 2.2.0
> Reporter: Hosur Narahari
>
> When we generate a dataframe by grouping, and then perform a join between the original dataframe and the aggregated column, we get an AnalysisException. Below I've attached a piece of code and the resulting exception to reproduce the issue.
> Code:
> import org.apache.spark.sql.SparkSession
> object App {
> lazy val spark = SparkSession.builder.appName("Test").master("local").getOrCreate
> def main(args: Array[String]): Unit = {
> test1
> }
> private def test1 {
> import org.apache.spark.sql.functions._
> val df = spark.createDataFrame(Seq(("M",172,60), ("M", 170, 60), ("F", 155, 56), ("M", 160, 55), ("F", 150, 53))).toDF("gender", "height", "weight")
> val groupDF = df.groupBy("gender").agg(min("height").as("height"))
> groupDF.show()
> val out = groupDF.join(df, groupDF("height") <=> df("height")).select(df("gender"), df("height"), df("weight"))
> out.show
> }
> }
> When I ran above code, I got below exception:
> Exception in thread "main" org.apache.spark.sql.AnalysisException: resolved attribute(s) height#8 missing from height#19,height#30,gender#29,weight#31,gender#7 in operator !Join Inner, (height#19 <=> height#8);;
> !Join Inner, (height#19 <=> height#8)
> :- Aggregate [gender#7], [gender#7, min(height#8) AS height#19]
> : +- Project [_1#0 AS gender#7, _2#1 AS height#8, _3#2 AS weight#9]
> : +- LocalRelation [_1#0, _2#1, _3#2]
> +- Project [_1#0 AS gender#29, _2#1 AS height#30, _3#2 AS weight#31]
> +- LocalRelation [_1#0, _2#1, _3#2]
> at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:39)
> at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:90)
> at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:342)
> at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:78)
> at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)
> at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:78)
> at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:90)
> at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:53)
> at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:67)
> at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:2831)
> at org.apache.spark.sql.Dataset.join(Dataset.scala:843)
> at org.apache.spark.sql.Dataset.join(Dataset.scala:807)
> at App$.test1(App.scala:17)
> at App$.main(App.scala:9)
> at App.main(App.scala)
> Could someone please look into this?
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org