You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Zhongshuai Pei (JIRA)" <ji...@apache.org> on 2015/06/18 15:49:00 UTC
[jira] [Updated] (SPARK-8441) ArrayIndexOutOfBoundsException when
select all fields of a parquet table with nested schema
[ https://issues.apache.org/jira/browse/SPARK-8441?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Zhongshuai Pei updated SPARK-8441:
----------------------------------
Description:
Table Info :
{noformat}
create table t1(
key1 int,
key2 String,
key3 int,
key4 array<struct<a:smallint,b:int,c:int,d:string,e:int,f:string,g:int,h:string,i:string,j:smallint,k:smallint,h:string,i:bigint,j:bigint,k:smallint,l:bigint>>,
key5 array<struct<a1:smallint,b1:int,c:int,d1:string,e1:int,f1:string,g1:int,h1:string,i1:string,j1:smallint,k1:smallint,h1:string,i1:bigint,j1:bigint,k1:smallint,l1:bigint>>)
partitioned by (hour int,last_num string)
stored as parquet
{noformat}
Question :
{noformat}
When I set spark.sql.hive.convertMetastoreParquet=true, and use sql "select * from table t1", then I get an exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4, 169.10.35.34): parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file hdfs://169.10.35.33:9000/user/hive/warehouse/cdr_voice_call/hour=10/last_num=0/000017_0
at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:213)
at parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:204)
at org.apache.spark.sql.sources.SqlNewHadoopRDD$$anon$1.hasNext(SqlNewHadoopRDD.scala:163)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
at java.lang.Thread.run(Thread.java:722)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -1
at java.util.ArrayList.elementData(ArrayList.java:371)
at java.util.ArrayList.get(ArrayList.java:384)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.PrimitiveColumnIO.getLast(PrimitiveColumnIO.java:80)
at parquet.io.PrimitiveColumnIO.isLast(PrimitiveColumnIO.java:74)
at parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:290)
at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:131)
at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:96)
at parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:136)
at parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:96)
at parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:126)
at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:193)
... 28 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4, 169.10.35.34): parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file hdfs://169.10.35.33:9000/user/hive/warehouse/cdr_voice_call/hour=10/last_num=0/000017_0
at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:213)
at parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:204)
at org.apache.spark.sql.sources.SqlNewHadoopRDD$$anon$1.hasNext(SqlNewHadoopRDD.scala:163)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
at java.lang.Thread.run(Thread.java:722)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -1
at java.util.ArrayList.elementData(ArrayList.java:371)
at java.util.ArrayList.get(ArrayList.java:384)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
at parquet.io.PrimitiveColumnIO.getLast(PrimitiveColumnIO.java:80)
at parquet.io.PrimitiveColumnIO.isLast(PrimitiveColumnIO.java:74)
at parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:290)
at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:131)
at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:96)
at parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:136)
at parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:96)
at parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:126)
at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:193)
... 28 more
{noformat}
> ArrayIndexOutOfBoundsException when select all fields of a parquet table with nested schema
> --------------------------------------------------------------------------------------------
>
> Key: SPARK-8441
> URL: https://issues.apache.org/jira/browse/SPARK-8441
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.4.0
> Reporter: Zhongshuai Pei
>
> Table Info :
> {noformat}
> create table t1(
> key1 int,
> key2 String,
> key3 int,
> key4 array<struct<a:smallint,b:int,c:int,d:string,e:int,f:string,g:int,h:string,i:string,j:smallint,k:smallint,h:string,i:bigint,j:bigint,k:smallint,l:bigint>>,
> key5 array<struct<a1:smallint,b1:int,c:int,d1:string,e1:int,f1:string,g1:int,h1:string,i1:string,j1:smallint,k1:smallint,h1:string,i1:bigint,j1:bigint,k1:smallint,l1:bigint>>)
> partitioned by (hour int,last_num string)
> stored as parquet
> {noformat}
> Question :
> {noformat}
> When I set spark.sql.hive.convertMetastoreParquet=true, and use sql "select * from table t1", then I get an exception:
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4, 169.10.35.34): parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file hdfs://169.10.35.33:9000/user/hive/warehouse/cdr_voice_call/hour=10/last_num=0/000017_0
> at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:213)
> at parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:204)
> at org.apache.spark.sql.sources.SqlNewHadoopRDD$$anon$1.hasNext(SqlNewHadoopRDD.scala:163)
> at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
> at java.lang.Thread.run(Thread.java:722)
> Caused by: java.lang.ArrayIndexOutOfBoundsException: -1
> at java.util.ArrayList.elementData(ArrayList.java:371)
> at java.util.ArrayList.get(ArrayList.java:384)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.PrimitiveColumnIO.getLast(PrimitiveColumnIO.java:80)
> at parquet.io.PrimitiveColumnIO.isLast(PrimitiveColumnIO.java:74)
> at parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:290)
> at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:131)
> at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:96)
> at parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:136)
> at parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:96)
> at parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:126)
> at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:193)
> ... 28 more
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1266)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1257)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1256)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1256)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
> at scala.Option.foreach(Option.scala:236)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1450)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1411)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost task 0.3 in stage 1.0 (TID 4, 169.10.35.34): parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file hdfs://169.10.35.33:9000/user/hive/warehouse/cdr_voice_call/hour=10/last_num=0/000017_0
> at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:213)
> at parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:204)
> at org.apache.spark.sql.sources.SqlNewHadoopRDD$$anon$1.hasNext(SqlNewHadoopRDD.scala:163)
> at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:308)
> at scala.collection.Iterator$class.foreach(Iterator.scala:727)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
> at scala.collection.AbstractIterator.to(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
> at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
> at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
> at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1765)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> at org.apache.spark.scheduler.Task.run(Task.scala:70)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
> at java.lang.Thread.run(Thread.java:722)
> Caused by: java.lang.ArrayIndexOutOfBoundsException: -1
> at java.util.ArrayList.elementData(ArrayList.java:371)
> at java.util.ArrayList.get(ArrayList.java:384)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.GroupColumnIO.getLast(GroupColumnIO.java:95)
> at parquet.io.PrimitiveColumnIO.getLast(PrimitiveColumnIO.java:80)
> at parquet.io.PrimitiveColumnIO.isLast(PrimitiveColumnIO.java:74)
> at parquet.io.RecordReaderImplementation.<init>(RecordReaderImplementation.java:290)
> at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:131)
> at parquet.io.MessageColumnIO$1.visit(MessageColumnIO.java:96)
> at parquet.filter2.compat.FilterCompat$NoOpFilter.accept(FilterCompat.java:136)
> at parquet.io.MessageColumnIO.getRecordReader(MessageColumnIO.java:96)
> at parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:126)
> at parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:193)
> ... 28 more
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org