You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Takeshi Yamamuro (JIRA)" <ji...@apache.org> on 2017/05/20 15:01:04 UTC
[jira] [Closed] (SPARK-20804) Join with null safe equality fails
with AnalysisException
[ https://issues.apache.org/jira/browse/SPARK-20804?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Takeshi Yamamuro closed SPARK-20804.
------------------------------------
Resolution: Duplicate
> Join with null safe equality fails with AnalysisException
> ---------------------------------------------------------
>
> Key: SPARK-20804
> URL: https://issues.apache.org/jira/browse/SPARK-20804
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.2.0
> Environment: org.apache.spark#spark-sql_2.11;2.3.0-SNAPSHOT from asf snapshots, Mon May 15 08:09:18 EDT 2017
> Reporter: koert kuipers
> Priority: Minor
>
> {noformat}
> val x = Seq(("a", 1), ("a", 2), (null, 1)).toDF("k", "v")
> val sums = x.groupBy($"k").agg(sum($"v") as "sum")
> x
> .join(sums, x("k") <=> sums("k"))
> .drop(sums("k"))
> .show
> {noformat}
> This gives the following error:
> {noformat}
> org.apache.spark.sql.AnalysisException: Detected cartesian product for INNER join between logical plans
> Project [_2#54 AS v#57]
> +- LocalRelation [_1#53, _2#54]
> and
> Aggregate [k#69], [k#69, sum(cast(v#70 as bigint)) AS sum#65L]
> +- Project [_1#53 AS k#69, _2#54 AS v#70]
> +- LocalRelation [_1#53, _2#54]
> Join condition is missing or trivial.
> Use the CROSS JOIN syntax to allow cartesian products between these relations.;
> at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts$$anonfun$apply$20.applyOrElse(Optimizer.scala:1081)
> at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts$$anonfun$apply$20.applyOrElse(Optimizer.scala:1078)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267)
> at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
> at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
> at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)
> at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
> at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256)
> at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts.apply(Optimizer.scala:1078)
> at org.apache.spark.sql.catalyst.optimizer.CheckCartesianProducts.apply(Optimizer.scala:1063)
> at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
> at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
> at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
> at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
> at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
> at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
> at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
> at scala.collection.immutable.List.foreach(List.scala:381)
> at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
> at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:79)
> at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:79)
> at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:85)
> at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:81)
> at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:90)
> at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:90)
> at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2901)
> at org.apache.spark.sql.Dataset.head(Dataset.scala:2238)
> at org.apache.spark.sql.Dataset.take(Dataset.scala:2451)
> at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:680)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:639)
> at org.apache.spark.sql.Dataset.show(Dataset.scala:648)
> {noformat}
> However, this variant (renaming the key column before the join) works fine:
> {noformat}
> val x = Seq(("a", 1), ("a", 2), (null, 1)).toDF("k", "v")
> val sums = x.select($"k" as "k1", $"v").groupBy($"k1").agg(sum($"v") as "sum")
> x
> .join(sums, x("k") <=> sums("k1"))
> .drop(sums("k1"))
> .show
> +----+---+---+
> | k| v|sum|
> +----+---+---+
> | a| 1| 3|
> | a| 2| 3|
> |null| 1| 1|
> +----+---+---+
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org