You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spark.apache.org by Alexander Pivovarov <ap...@gmail.com> on 2015/12/20 19:42:57 UTC
Spark fails after 6000s because of akka
I run Spark 1.5.2 on YARN (EMR)
I noticed that my long running jobs always failed after 1h 40 min (6000s)
with the exceptions below.
Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
I changed the settings to the following and it solve my issue.
"spark.akka.heartbeat.pauses": "60000s",
"spark.akka.heartbeat.interval": "10000s"
RROR ErrorMonitor - Uncaught fatal error from thread
[sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down
ActorSystem [sparkDriver]
java.lang.OutOfMemoryError: Java heap space
at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
ERROR ActorSystemImpl - Uncaught fatal error from thread
[sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down
ActorSystem [sparkDriver]
java.lang.OutOfMemoryError: Java heap space
at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
ERROR ActorSystemImpl - Uncaught fatal error from thread
[sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down
ActorSystem [sparkDriver]
java.lang.OutOfMemoryError: Java heap space
at java.util.Arrays.copyOf(Arrays.java:2271)
at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
ERROR ActorSystemImpl - Uncaught fatal error from thread
[sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down
ActorSystem [sparkDriver]
java.lang.OutOfMemoryError: Java heap space
at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Re: Spark fails after 6000s because of akka
Posted by Alexander Pivovarov <ap...@gmail.com>.
Documentation says that this setting is used to disable Akka transport
failure detector.
Why magic number 6000s is used then?
It should be maximum possible number instead of 6000s to disable heartbeat
Using magic numbers like 1 hour and 40 min creates issues which are
difficult to debug. Most probably all Spark integration tests run faster
that 6000s and this area is basically not tested.
Anyone know why spark.akka.heartbeat.pauses=6000s ???
On Sun, Dec 20, 2015 at 9:40 PM, Josh Rosen <jo...@databricks.com>
wrote:
> Would you mind copying this information into a JIRA ticket to make it
> easier to discover / track? Thanks!
>
> On Sun, Dec 20, 2015 at 11:35 AM Alexander Pivovarov <ap...@gmail.com>
> wrote:
>
>> Usually Spark EMR job fails with the following exception in 1 hour 40 min
>> - Job cancelled because SparkContext was shut down
>>
>> java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6294]
>> at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048)
>> at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
>> at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
>> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
>> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
>> at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
>> at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634)
>> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
>> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at scala.concurrent.Promise$class.complete(Promise.scala:55)
>> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
>> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
>> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
>> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
>> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
>> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
>> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
>> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
>> at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
>> at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)
>> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
>> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
>> at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)
>> at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)
>> at java.lang.Thread.run(Thread.java:745)
>> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
>> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
>> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
>> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
>> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
>> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
>> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
>> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
>> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
>> at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.RDD.fold(RDD.scala:1057)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33)
>> at com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41)
>> at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35)
>> at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75)
>> at com.radius.distiller.Execute$.run(Execute.scala:55)
>> at com.radius.distiller.Execute$.main(Execute.scala:29)
>> at com.radius.distiller.Execute.main(Execute.scala)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:606)
>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>> Command exiting with ret '1'
>>
>>
>> On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov <
>> apivovarov@gmail.com> wrote:
>>
>>> Or this message
>>>
>>> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
>>> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
>>> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
>>> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
>>> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
>>> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
>>> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
>>> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
>>> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
>>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>>> at com.radius.distiller.Execute.main(Execute.scala)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>> at java.lang.reflect.Method.invoke(Method.java:606)
>>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>>
>>>
>>> On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov <
>>> apivovarov@gmail.com> wrote:
>>>
>>>> it can also fail with the following message
>>>>
>>>> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3 in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>>> at java.lang.Thread.run(Thread.java:745)
>>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>>> ... 1 more
>>>>
>>>> Driver stacktrace:
>>>> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
>>>> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>>>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>>>> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>>> at scala.Option.foreach(Option.scala:236)
>>>> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
>>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
>>>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>>>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>>>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>>>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>>>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>>>> at com.radius.distiller.Execute.main(Execute.scala)
>>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>>> at java.lang.reflect.Method.invoke(Method.java:606)
>>>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>>>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>>>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>>>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>>>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>>> Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>>> at java.lang.Thread.run(Thread.java:745)
>>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>>> ... 1 more
>>>>
>>>>
>>>> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <
>>>> apivovarov@gmail.com> wrote:
>>>>
>>>>> I run Spark 1.5.2 on YARN (EMR)
>>>>>
>>>>> I noticed that my long running jobs always failed after 1h 40 min
>>>>> (6000s) with the exceptions below.
>>>>>
>>>>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by
>>>>> default
>>>>>
>>>>> I changed the settings to the following and it solve my issue.
>>>>>
>>>>> "spark.akka.heartbeat.pauses": "60000s",
>>>>> "spark.akka.heartbeat.interval": "10000s"
>>>>>
>>>>>
>>>>>
>>>>> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at java.util.Arrays.copyOf(Arrays.java:2271)
>>>>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>>>>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>>>>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>>>>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>>>>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>>> java.lang.OutOfMemoryError: Java heap space
>>>>> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>>>>> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>>
>>>>> at
>>>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>>
>>>>>
>>>>
>>>>
>>>
>>
Re: Spark fails after 6000s because of akka
Posted by Josh Rosen <jo...@databricks.com>.
Would you mind copying this information into a JIRA ticket to make it
easier to discover / track? Thanks!
On Sun, Dec 20, 2015 at 11:35 AM Alexander Pivovarov <ap...@gmail.com>
wrote:
> Usually Spark EMR job fails with the following exception in 1 hour 40 min
> - Job cancelled because SparkContext was shut down
>
> java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6294]
> at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048)
> at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
> at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
> at scala.concurrent.Promise$class.complete(Promise.scala:55)
> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
> at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
> at scala.concurrent.Promise$class.complete(Promise.scala:55)
> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
> at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643)
> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658)
> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
> at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
> at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
> at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634)
> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685)
> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
> at scala.concurrent.Promise$class.complete(Promise.scala:55)
> at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
> at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
> at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
> at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
> at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
> at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
> at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
> at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)
> at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
> at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
> at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)
> at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)
> at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)
> at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)
> at java.lang.Thread.run(Thread.java:745)
> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
> at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.RDD.fold(RDD.scala:1057)
> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34)
> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
> at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33)
> at com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41)
> at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35)
> at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75)
> at com.radius.distiller.Execute$.run(Execute.scala:55)
> at com.radius.distiller.Execute$.main(Execute.scala:29)
> at com.radius.distiller.Execute.main(Execute.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> Command exiting with ret '1'
>
>
> On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov <
> apivovarov@gmail.com> wrote:
>
>> Or this message
>>
>> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
>> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
>> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
>> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
>> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
>> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
>> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
>> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
>> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>> at com.radius.distiller.Execute.main(Execute.scala)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:606)
>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>
>>
>> On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov <
>> apivovarov@gmail.com> wrote:
>>
>>> it can also fail with the following message
>>>
>>> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3 in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>> at java.lang.Thread.run(Thread.java:745)
>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>> ... 1 more
>>>
>>> Driver stacktrace:
>>> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
>>> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>>> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>>> at scala.Option.foreach(Option.scala:236)
>>> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
>>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
>>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>>> at com.radius.distiller.Execute.main(Execute.scala)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>>> at java.lang.reflect.Method.invoke(Method.java:606)
>>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>>> Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>>> at java.lang.Thread.run(Thread.java:745)
>>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>>> ... 1 more
>>>
>>>
>>> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <
>>> apivovarov@gmail.com> wrote:
>>>
>>>> I run Spark 1.5.2 on YARN (EMR)
>>>>
>>>> I noticed that my long running jobs always failed after 1h 40 min
>>>> (6000s) with the exceptions below.
>>>>
>>>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>>>>
>>>> I changed the settings to the following and it solve my issue.
>>>>
>>>> "spark.akka.heartbeat.pauses": "60000s",
>>>> "spark.akka.heartbeat.interval": "10000s"
>>>>
>>>>
>>>>
>>>> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>> java.lang.OutOfMemoryError: Java heap space
>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>> java.lang.OutOfMemoryError: Java heap space
>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
>>>> java.lang.OutOfMemoryError: Java heap space
>>>> at java.util.Arrays.copyOf(Arrays.java:2271)
>>>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>>>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>>>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>>>> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>>>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>>>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>>> java.lang.OutOfMemoryError: Java heap space
>>>> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>>>> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>>
>>>> at
>>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>>
>>>>
>>>
>>>
>>
>
Re: Spark fails after 6000s because of akka
Posted by Alexander Pivovarov <ap...@gmail.com>.
Usually Spark EMR job fails with the following exception in 1 hour 40 min - Job
cancelled because SparkContext was shut down
java.util.concurrent.RejectedExecutionException: Task
scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from
java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size
= 0, active threads = 0, queued tasks = 0, completed tasks = 6294]
at java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048)
at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643)
at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658)
at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
at scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at scala.concurrent.Promise$class.complete(Promise.scala:55)
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32)
at org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293)
at scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248)
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467)
at akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419)
at akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423)
at akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375)
at java.lang.Thread.run(Thread.java:745)
Exception in thread "main" org.apache.spark.SparkException: Job
cancelled because SparkContext was shut down
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.RDD.fold(RDD.scala:1057)
at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34)
at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
at org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33)
at com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41)
at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35)
at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75)
at com.radius.distiller.Execute$.run(Execute.scala:55)
at com.radius.distiller.Execute$.main(Execute.scala:29)
at com.radius.distiller.Execute.main(Execute.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Command exiting with ret '1'
On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov <ap...@gmail.com>
wrote:
> Or this message
>
> Exception in thread "main" org.apache.spark.SparkException: Job cancelled because SparkContext was shut down
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
> at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
> at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
> at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
> at com.radius.distiller.Execute$.run(Execute.scala:56)
> at com.radius.distiller.Execute$.main(Execute.scala:33)
> at com.radius.distiller.Execute.main(Execute.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>
>
> On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov <
> apivovarov@gmail.com> wrote:
>
>> it can also fail with the following message
>>
>> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3 in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>> at java.lang.Thread.run(Thread.java:745)
>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>> ... 1 more
>>
>> Driver stacktrace:
>> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
>> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
>> at scala.Option.foreach(Option.scala:236)
>> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
>> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
>> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
>> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
>> at com.radius.distiller.Execute$.run(Execute.scala:56)
>> at com.radius.distiller.Execute$.main(Execute.scala:33)
>> at com.radius.distiller.Execute.main(Execute.scala)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
>> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>> at java.lang.reflect.Method.invoke(Method.java:606)
>> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
>> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>> Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
>> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
>> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
>> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
>> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
>> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
>> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>> at java.lang.Thread.run(Thread.java:745)
>> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
>> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
>> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
>> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>> ... 1 more
>>
>>
>> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <
>> apivovarov@gmail.com> wrote:
>>
>>> I run Spark 1.5.2 on YARN (EMR)
>>>
>>> I noticed that my long running jobs always failed after 1h 40 min
>>> (6000s) with the exceptions below.
>>>
>>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>>>
>>> I changed the settings to the following and it solve my issue.
>>>
>>> "spark.akka.heartbeat.pauses": "60000s",
>>> "spark.akka.heartbeat.interval": "10000s"
>>>
>>>
>>>
>>> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>> java.lang.OutOfMemoryError: Java heap space
>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>> java.lang.OutOfMemoryError: Java heap space
>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
>>> java.lang.OutOfMemoryError: Java heap space
>>> at java.util.Arrays.copyOf(Arrays.java:2271)
>>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>>> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>>> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>>> java.lang.OutOfMemoryError: Java heap space
>>> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>>> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>>
>>> at
>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>>
>>>
>>
>>
>
Re: Spark fails after 6000s because of akka
Posted by Alexander Pivovarov <ap...@gmail.com>.
Or this message
Exception in thread "main" org.apache.spark.SparkException: Job
cancelled because SparkContext was shut down
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514)
at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438)
at org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1723)
at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
at com.radius.distiller.Execute$.run(Execute.scala:56)
at com.radius.distiller.Execute$.main(Execute.scala:33)
at com.radius.distiller.Execute.main(Execute.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov <ap...@gmail.com>
wrote:
> it can also fail with the following message
>
> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3 in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
> ... 1 more
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
> at scala.Option.foreach(Option.scala:236)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
> at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
> at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
> at com.radius.distiller.Execute$.run(Execute.scala:56)
> at com.radius.distiller.Execute$.main(Execute.scala:33)
> at com.radius.distiller.Execute.main(Execute.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
> Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
> at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
> at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
> at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
> at java.util.concurrent.FutureTask.run(FutureTask.java:262)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
> at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
> at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
> at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
> at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
> at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
> at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
> at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
> ... 1 more
>
>
> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <
> apivovarov@gmail.com> wrote:
>
>> I run Spark 1.5.2 on YARN (EMR)
>>
>> I noticed that my long running jobs always failed after 1h 40 min
>> (6000s) with the exceptions below.
>>
>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>>
>> I changed the settings to the following and it solve my issue.
>>
>> "spark.akka.heartbeat.pauses": "60000s",
>> "spark.akka.heartbeat.interval": "10000s"
>>
>>
>>
>> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>> java.lang.OutOfMemoryError: Java heap space
>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>> java.lang.OutOfMemoryError: Java heap space
>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
>> java.lang.OutOfMemoryError: Java heap space
>> at java.util.Arrays.copyOf(Arrays.java:2271)
>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
>> java.lang.OutOfMemoryError: Java heap space
>> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>>
>> at
>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>>
>>
>
>
Re: Spark fails after 6000s because of akka
Posted by Alexander Pivovarov <ap...@gmail.com>.
it can also fail with the following message
Exception in thread "main" org.apache.spark.SparkException: Job
aborted due to stage failure: Task 133 in stage 33.1 failed 4 times,
most recent failure: Lost task 133.3 in stage 33.1 (TID 172737,
ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
... 1 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
at com.radius.distiller.Execute$.run(Execute.scala:56)
at com.radius.distiller.Execute$.main(Execute.scala:33)
at com.radius.distiller.Execute.main(Execute.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
... 1 more
On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <ap...@gmail.com>
wrote:
> I run Spark 1.5.2 on YARN (EMR)
>
> I noticed that my long running jobs always failed after 1h 40 min (6000s)
> with the exceptions below.
>
> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>
> I changed the settings to the following and it solve my issue.
>
> "spark.akka.heartbeat.pauses": "60000s",
> "spark.akka.heartbeat.interval": "10000s"
>
>
>
> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> at java.util.Arrays.copyOf(Arrays.java:2271)
> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
> at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
> at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
> at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
> at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
> at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>
> at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
>