Posted to issues@spark.apache.org by "Jason C Lee (JIRA)" <ji...@apache.org> on 2015/10/06 02:22:26 UTC

[jira] [Commented] (SPARK-10925) Exception when joining DataFrames

    [ https://issues.apache.org/jira/browse/SPARK-10925?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14944303#comment-14944303 ] 

Jason C Lee commented on SPARK-10925:
-------------------------------------

I removed your second step, applying a UDF on column "name", and was still able to reproduce the problem. I reduced your test case to the following:

import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

object TestCase2 {

  case class Individual(id: String, name: String, surname: String, birthDate: String)

  def main(args: Array[String]) {

    val sc = new SparkContext("local", "join DFs")
    val sqlContext = new SQLContext(sc)

    val rdd = sc.parallelize(Seq(
      Individual("000014", "patrick", "andrews", "10/10/1970")
    ))

    val df = sqlContext.createDataFrame(rdd)
    df.show()

    val df1 = df
    // note: the derived columns reference the original df, not df1/df2
    val df2 = df1.withColumn("surname1", df("surname"))
    df2.show()

    val df3 = df2.withColumn("birthDate1", df("birthDate"))
    df3.show()

    val cardinalityDF1 = df3.groupBy("name")
      .agg(count("name").as("cardinality_name"))
    cardinalityDF1.show()

    // the first aggregate + self-join succeeds
    val df4 = df3.join(cardinalityDF1, df3("name") === cardinalityDF1("name"))
    df4.show()

    val cardinalityDF2 = df4.groupBy("surname1")
      .agg(count("surname1").as("cardinality_surname"))
    cardinalityDF2.show()

    // the second self-join triggers the AnalysisException quoted below
    val df5 = df4.join(cardinalityDF2, df4("surname") === cardinalityDF2("surname1"))
    df5.show()
  }
}
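
The "resolved attribute(s) surname#20 missing from id#0,birthDate#3,name#10,surname#7" message in the exception below shows two different expression IDs (#20 vs #7) for the same column name, so after the chained withColumn/self-joins the join condition references an attribute the child plan no longer exposes. As a possible workaround (an untested sketch; "name_key" and "surname_key" are illustrative names, not from the original code), renaming the grouping column on the aggregated side keeps each join condition unambiguous:

// workaround sketch: give the aggregated side its own column name so the
// join never has to resolve two attributes with the same name
val aggByName = df3.groupBy("name")
  .agg(count("name").as("cardinality_name"))
  .withColumnRenamed("name", "name_key")
val joined1 = df3.join(aggByName, df3("name") === aggByName("name_key"))
  .drop("name_key")

val aggBySurname = joined1.groupBy("surname1")
  .agg(count("surname1").as("cardinality_surname"))
  .withColumnRenamed("surname1", "surname_key")
val joined2 = joined1.join(aggBySurname, joined1("surname") === aggBySurname("surname_key"))
  .drop("surname_key")
joined2.show()

The same idea should apply to the full workflow with the UDF steps left in.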

> Exception when joining DataFrames
> ---------------------------------
>
>                 Key: SPARK-10925
>                 URL: https://issues.apache.org/jira/browse/SPARK-10925
>             Project: Spark
>          Issue Type: Bug
>    Affects Versions: 1.5.0, 1.5.1
>         Environment: Tested with Spark 1.5.0 and Spark 1.5.1
>            Reporter: Alexis Seigneurin
>         Attachments: Photo 05-10-2015 14 31 16.jpg, TestCase2.scala
>
>
> I get an exception when joining a DataFrame with another DataFrame. The second DataFrame was created by performing an aggregation on the first DataFrame.
> My complete workflow is:
> # read the DataFrame
> # apply a UDF on column "name"
> # apply a UDF on column "surname"
> # apply a UDF on column "birthDate"
> # aggregate on "name" and re-join with the DF
> # aggregate on "surname" and re-join with the DF
> If I remove one step, the process completes normally.
> Here is the exception:
> {code}
> Exception in thread "main" org.apache.spark.sql.AnalysisException: resolved attribute(s) surname#20 missing from id#0,birthDate#3,name#10,surname#7 in operator !Project [id#0,birthDate#3,name#10,surname#20,UDF(birthDate#3) AS birthDate_cleaned#8];
> 	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:37)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:44)
> 	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:154)
> 	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:49)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:103)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:102)
> 	at scala.collection.immutable.List.foreach(List.scala:318)
> 	at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:102)
> 	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:49)
> 	at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:44)
> 	at org.apache.spark.sql.SQLContext$QueryExecution.assertAnalyzed(SQLContext.scala:914)
> 	at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:132)
> 	at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$logicalPlanToDataFrame(DataFrame.scala:154)
> 	at org.apache.spark.sql.DataFrame.join(DataFrame.scala:553)
> 	at org.apache.spark.sql.DataFrame.join(DataFrame.scala:520)
> 	at TestCase2$.main(TestCase2.scala:51)
> 	at TestCase2.main(TestCase2.scala)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 	at java.lang.reflect.Method.invoke(Method.java:497)
> 	at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140)
> {code}
> I'm attaching a test case that I tried with Spark 1.5.0 and 1.5.1. Please note that it used to work with version 1.4.1.
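
For reference, the UDF(birthDate#3) AS birthDate_cleaned#8 fragment in the trace suggests the attached TestCase2.scala applied the UDFs roughly as follows (a sketch only; the cleaning logic and the _cleaned column names are inferred from the trace, not taken from the attachment):

import org.apache.spark.sql.functions.udf

// placeholder cleaning UDF; the real cleaning logic is in the attachment
val clean = udf((s: String) => s.trim)

val cleaned = df
  .withColumn("name_cleaned", clean(df("name")))           // step 2
  .withColumn("surname_cleaned", clean(df("surname")))     // step 3
  .withColumn("birthDate_cleaned", clean(df("birthDate"))) // step 4

Steps 5 and 6 then aggregate and re-join just as in the reduced test case above, which is already enough to hit the same analysis failure.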



