You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Louis Salin (JIRA)" <ji...@apache.org> on 2016/08/25 14:37:22 UTC
[jira] [Closed] (SPARK-17232) Expecting same behavior after loading
a dataframe with dots in column name
[ https://issues.apache.org/jira/browse/SPARK-17232?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Louis Salin closed SPARK-17232.
-------------------------------
Resolution: Fixed
This issue was fixed in the master branch.
> Expecting same behavior after loading a dataframe with dots in column name
> --------------------------------------------------------------------------
>
> Key: SPARK-17232
> URL: https://issues.apache.org/jira/browse/SPARK-17232
> Project: Spark
> Issue Type: Bug
> Affects Versions: 2.0.0
> Reporter: Louis Salin
>
> In Spark 2.0, the behavior of a dataframe changes after saving and reloading it when there are dots in the column names. In the example below, I was able to call the {{rdd}} function on a newly created dataframe. However, after saving and reloading it, an exception is thrown when calling the {{rdd}} function.
> from a spark-shell:
> {{scala> val simpleDf = Seq((1, 2)).toDF("a.b", "a.c")}}
> Res1: org.apache.spark.sql.DataFrame = \[a.b: int, a.c: int\]
> {{scala> simpleDf.rdd}}
> Res2: org.apache.spark.rdd.RDD\[org.apache.spark.sql.Row\] = MapPartitionsRDD\[7\] at rdd at <console>:29
> {{scala> simpleDf.write.parquet("/user/lsalin/simpleDf")}}
> {{scala> val readDf = spark.read.parquet("/user/lsalin/simpleDf")}}
> Res4: org.apache.spark.sql.DataFrame = \[a.b: int, a.c: int\]
> {{scala> readDf.rdd}}
> {noformat}
> org.apache.spark.sql.AnalysisException: Unable to resolve a.b given [a.b, a.c];
> at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
> at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:134)
> at scala.Option.getOrElse(Option.scala:121)
> at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:133)
> at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:129)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
> at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
> at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
> at org.apache.spark.sql.types.StructType.foreach(StructType.scala:95)
> at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
> at org.apache.spark.sql.types.StructType.map(StructType.scala:95)
> at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:129)
> at org.apache.spark.sql.execution.datasources.FileSourceStrategy$.apply(FileSourceStrategy.scala:87)
> at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
> at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:60)
> at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
> at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
> at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:61)
> at org.apache.spark.sql.execution.SparkPlanner.plan(SparkPlanner.scala:47)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
> at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:298)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:321)
> at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:179)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:319)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:298)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:51)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1$$anonfun$apply$1.applyOrElse(SparkPlanner.scala:48)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
> at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:301)
> at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69)
> at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:300)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
> at org.apache.spark.sql.execution.SparkPlanner$$anonfun$plan$1.apply(SparkPlanner.scala:48)
> at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
> at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:78)
> at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:76)
> at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:83)
> at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:83)
> at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
> at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
> at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:2347)
> at org.apache.spark.sql.Dataset.rdd(Dataset.scala:2344)
> ... 50 elided
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org