You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Yin Huai (JIRA)" <ji...@apache.org> on 2015/09/15 23:26:46 UTC
[jira] [Resolved] (SPARK-4278) SparkSQL job failing with
java.lang.ClassCastException
[ https://issues.apache.org/jira/browse/SPARK-4278?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Yin Huai resolved SPARK-4278.
-----------------------------
Resolution: Fixed
Assignee: Yin Huai
Fix Version/s: 1.5.0
In Spark 1.5, if the type of data does not match the schema, we will throw a runtime exception saying that we cannot parse a value as the type defined in the schema, and the error message also contains the token we got from the Jackson parser. I think it is a better error than a ClassCastException. So, I am resolving this JIRA.
> SparkSQL job failing with java.lang.ClassCastException
> ------------------------------------------------------
>
> Key: SPARK-4278
> URL: https://issues.apache.org/jira/browse/SPARK-4278
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Reporter: Parviz Deyhim
> Assignee: Yin Huai
> Fix For: 1.5.0
>
>
> The following job fails with the java.lang.ClassCastException error. Ideally SparkSQL should have the ability to ignore records that don't conform to the inferred schema.
> The steps that get me to this error:
> 1) infer schema from a small subset of data
> 2) apply the schema to a larger dataset
> 3) do a simple join of two datasets
> sample code:
> {code}
> val sampleJson = sqlContext.jsonRDD(sc.textFile(".../dt=2014-10-10/file.snappy"))
> val mydata = sqlContext.jsonRDD(larger_dataset,sampleJson.schema)
> mydata.registerTempTable("mytable1")
> other dataset:
> val x = sc.textFile(".....")
> case class Dataset(a:String,state:String, b:String, z:String, c:String, d:String)
> val xSchemaRDD = x.map(_.split("\t")).map(f=>Dataset(f(0),f(1),f(2),f(3),f(4),f(5)))
> xSchemaRDD.registerTempTable("mytable2")
> {code}
> java.lang.ClassCastException: java.lang.Long cannot be cast to java.lang.Integer
> scala.runtime.BoxesRunTime.unboxToInt(BoxesRunTime.java:106)
> org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:389)
> org.apache.spark.sql.json.JsonRDD$$anonfun$enforceCorrectType$1.apply(JsonRDD.scala:397)
> scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
> scala.collection.AbstractTraversable.map(Traversable.scala:105)
> org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:397)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
> scala.Option.map(Option.scala:145)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
> org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
> scala.Option.map(Option.scala:145)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
> org.apache.spark.sql.json.JsonRDD$.enforceCorrectType(JsonRDD.scala:398)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1$$anonfun$apply$4.apply(JsonRDD.scala:410)
> scala.Option.map(Option.scala:145)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:409)
> org.apache.spark.sql.json.JsonRDD$$anonfun$org$apache$spark$sql$json$JsonRDD$$asRow$1.apply(JsonRDD.scala:407)
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> org.apache.spark.sql.json.JsonRDD$.org$apache$spark$sql$json$JsonRDD$$asRow(JsonRDD.scala:407)
> org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
> org.apache.spark.sql.json.JsonRDD$$anonfun$jsonStringToRow$1.apply(JsonRDD.scala:41)
> scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
> scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:389)
> scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:209)
> org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:65)
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
> org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
> org.apache.spark.scheduler.Task.run(Task.scala:56)
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:182)
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> java.lang.Thread.run(Thread.java:724)
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org