Posted to issues@spark.apache.org by "Hyukjin Kwon (JIRA)" <ji...@apache.org> on 2016/07/25 00:46:20 UTC

[jira] [Commented] (SPARK-16698) json parsing regression - "." in keys

    [ https://issues.apache.org/jira/browse/SPARK-16698?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15391217#comment-15391217 ] 

Hyukjin Kwon commented on SPARK-16698:
--------------------------------------

FYI, this does not happen when the JSON is read from an RDD. Let me leave self-contained code to reproduce the issue.

This does not work (in `JsonSuite.scala`):

{code}
test("SPARK-16698 - json parsing regression - "." in keys") {
  withTempPath { path =>
    val json =""" {"a.b":"data"}"""
    spark.sparkContext
      .parallelize(json :: Nil)
      .saveAsTextFile(path.getAbsolutePath)
    spark.read.json(path.getAbsolutePath).collect()
  }
}
{code}

This works:

{code}
test("SPARK-16698 - json parsing regression - "." in keys") {
  withTempPath { path =>
    val json =""" {"a.b":"data"}"""
    val rdd = spark.sparkContext
      .parallelize(json :: Nil)
    spark.read.json(rdd).collect()
  }
}
{code}
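
For context, the dot is what makes the name ambiguous: without backticks, `a.b` is Spark's syntax for field `b` of a struct column `a`, while backticks force a literal top-level column name. A minimal illustration of the two readings (a sketch assuming a Spark 2.0 shell with `spark` in scope):

{code}
// A top-level column whose name literally contains a dot:
// backticks are needed to reference it.
val flat = spark.read.json(spark.sparkContext.parallelize("""{"a.b":"data"}""" :: Nil))
flat.select("`a.b`").show()

// The same dotted path without backticks means nested-field access,
// i.e. field "b" inside a struct column "a".
val nested = spark.read.json(spark.sparkContext.parallelize("""{"a":{"b":"data"}}""" :: Nil))
nested.select("a.b").show()
{code}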

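The stack trace in the report below goes through `LogicalPlan.resolve` called from `FileSourceStrategy` (the code path introduced by the buildReader change), which suggests the data-schema names are being parsed as dotted paths instead of matched literally. A hypothetical sketch of that failure mode, with made-up names, just to show the mechanism:

{code}
// Hypothetical illustration, not Spark's actual code: if resolution splits
// the requested name on dots, a column literally named "a.b" can never be
// found, because the lookup searches for a top-level column "a" instead.
def resolveByParts(name: String, topLevelColumns: Seq[String]): Option[String] = {
  val parts = name.split("\\.")      // "a.b" -> Array("a", "b")
  topLevelColumns.find(_ == parts.head)
}

resolveByParts("a.b", Seq("a.b"))    // None => "Unable to resolve a.b given [a.b]"
{code}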

> json parsing regression - "." in keys
> -------------------------------------
>
>                 Key: SPARK-16698
>                 URL: https://issues.apache.org/jira/browse/SPARK-16698
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.0.0
>            Reporter: TobiasP
>
> The commit 83775bc78e183791f75a99cdfbcd68a67ca0d472 "[SPARK-14158][SQL] implement buildReader for json data source" breaks parsing of json files with "." in keys.
> E.g. the test input for spark-solr https://github.com/lucidworks/spark-solr/blob/master/src/test/resources/test-data/events.json
> {noformat}
> scala> sqlContext.read.json("src/test/resources/test-data/events.json").collectAsList
> org.apache.spark.sql.AnalysisException: Unable to resolve params.title_s given [_version_, count_l, doc_id_s, flag_s, id, params.title_s, params.url_s, session_id_s, timestamp_tdt, type_s, tz_timestamp_txt, user_id_s];
>   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:131)
>   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1$$anonfun$apply$5.apply(LogicalPlan.scala:131)
>   at scala.Option.getOrElse(Option.scala:121)
>   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:130)
>   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$resolve$1.apply(LogicalPlan.scala:126)
>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245)
>   at scala.collection.Iterator$class.foreach(Iterator.scala:742)
>   at scala.collection.AbstractIterator.foreach(Iterator.scala:1194)
>   at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
>   at org.apache.spark.sql.types.StructType.foreach(StructType.scala:94)
>   at scala.collection.TraversableLike$class.map(TraversableLike.scala:245)
>   at org.apache.spark.sql.types.StructType.map(StructType.scala:94)
>   at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:126)
>   at org.apache.spark.sql.execution.datasources.FileSourceStrategy$.apply(FileSourceStrategy.scala:80)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
>   at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:396)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:59)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner.planLater(QueryPlanner.scala:54)
>   at org.apache.spark.sql.execution.SparkStrategies$SpecialLimits$.apply(SparkStrategies.scala:53)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
>   at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:396)
>   at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:59)
>   at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:52)
>   at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:50)
>   at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:57)
>   at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:57)
>   at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2321)
>   at org.apache.spark.sql.Dataset.collectAsList(Dataset.scala:2040)
>   ... 49 elided
> {noformat}



