You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Kaiyuan Yang (JIRA)" <ji...@apache.org> on 2016/01/11 13:58:39 UTC
[jira] [Commented] (SPARK-12066) spark sql throw
java.lang.ArrayIndexOutOfBoundsException when use table.* with join
[ https://issues.apache.org/jira/browse/SPARK-12066?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15091864#comment-15091864 ]
Kaiyuan Yang commented on SPARK-12066:
--------------------------------------
Dirty data could trigger a lot of “strange” exceptions, We should correct the data.
> spark sql throw java.lang.ArrayIndexOutOfBoundsException when use table.* with join
> -------------------------------------------------------------------------------------
>
> Key: SPARK-12066
> URL: https://issues.apache.org/jira/browse/SPARK-12066
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.4.0, 1.5.2
> Environment: linux
> Reporter: Ricky Yang
> Priority: Blocker
>
> throw java.lang.ArrayIndexOutOfBoundsException when I use following spark sql on spark standlone or yarn.
> the sql:
> select ta.*
> from bi_td.dm_price_seg_td tb
> join bi_sor.sor_ord_detail_tf ta
> on 1 = 1
> where ta.sale_dt = '20140514'
> and ta.sale_price >= tb.pri_from
> and ta.sale_price < tb.pri_to limit 10 ;
> But ,the result is correct when using no * as following:
> select ta.sale_dt
> from bi_td.dm_price_seg_td tb
> join bi_sor.sor_ord_detail_tf ta
> on 1 = 1
> where ta.sale_dt = '20140514'
> and ta.sale_price >= tb.pri_from
> and ta.sale_price < tb.pri_to limit 10 ;
> standlone version is 1.4.0 and version spark on yarn is 1.5.2
> error log :
>
> 15/11/30 14:19:59 ERROR SparkSQLDriver: Failed in [select ta.*
> from bi_td.dm_price_seg_td tb
> join bi_sor.sor_ord_detail_tf ta
> on 1 = 1
> where ta.sale_dt = '20140514'
> and ta.sale_price >= tb.pri_from
> and ta.sale_price < tb.pri_to limit 10 ]
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, namenode2-sit.cnsuning.com): java.lang.ArrayIndexOutOfBoundsException
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at scala.Option.foreach(Option.scala:236)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
> at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:215)
> at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:207)
> at org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:587)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:63)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:308)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:311)
> at org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:409)
> at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:425)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:166)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> Caused by: java.lang.ArrayIndexOutOfBoundsException
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, namenode2-sit.cnsuning.com): java.lang.ArrayIndexOutOfBoundsException
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at scala.Option.foreach(Option.scala:236)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
> at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:215)
> at org.apache.spark.sql.execution.Limit.executeCollect(basicOperators.scala:207)
> at org.apache.spark.sql.hive.HiveContext$QueryExecution.stringResult(HiveContext.scala:587)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLDriver.run(SparkSQLDriver.scala:63)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.processCmd(SparkSQLCLIDriver.scala:308)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:311)
> at org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:409)
> at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:425)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver$.main(SparkSQLCLIDriver.scala:166)
> at org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver.main(SparkSQLCLIDriver.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> Caused by: java.lang.ArrayIndexOutOfBoundsException
>
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org