You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@spark.apache.org by Trident <cw...@vip.qq.com> on 2014/10/06 08:00:29 UTC
Spark SQL on a Hive table with very large data produces strange output on version 1.0.2
Dear Developers,
I'm limited in using Spark 1.0.2 currently.
I use Spark SQL on a Hive table to load the AMPLab benchmark, which is approximately 25.6 GiB.
I run:
CREATE EXTERNAL TABLE uservisits (sourceIP STRING,destURL STRING, visitDate STRING,adRevenue DOUBLE,userAgent STRING,countryCode STRING, languageCode STRING,searchWord STRING,duration INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001" STORED AS SEQUENCEFILE LOCATION "/public/xxxx/data/uservisits"
okay!
I run:
SELECT COUNT(*) from uservisits
Okay! The result is correct.
but when I run:
SELECT SUBSTR(sourceIP, 1, 8), SUM(adRevenue) FROM uservisits GROUP BY SUBSTR(sourceIP, 1, 8)
There are some error messages (I have bolded and underlined some of the important messages), which boil down to
mainly two problems:
akka => Timed out
GC => Out of memory
what should I do?
...
14/10/05 23:45:18 INFO MemoryStore: Block broadcast_2 of size 158188 dropped from memory (free 308752285)
14/10/05 23:45:40 ERROR BlockManagerMaster: Failed to remove shuffle 4
akka.pattern.AskTimeoutException: Timed out
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$11.run(Scheduler.scala:118)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:455)
at akka.actor.LightArrayRevolverScheduler$$anon$12.executeBucket$1(Scheduler.scala:407)
at akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:411)
at akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:45:47 ERROR BlockManagerMaster: Failed to remove shuffle 0
akka.pattern.AskTimeoutException: Timed out
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$11.run(Scheduler.scala:118)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:455)
at akka.actor.LightArrayRevolverScheduler$$anon$12.executeBucket$1(Scheduler.scala:407)
at akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:411)
at akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:45:46 ERROR BlockManagerMaster: Failed to remove shuffle 2
akka.pattern.AskTimeoutException: Timed out
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$11.run(Scheduler.scala:118)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:455)
at akka.actor.LightArrayRevolverScheduler$$anon$12.executeBucket$1(Scheduler.scala:407)
at akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:411)
at akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:45:45 ERROR BlockManagerMaster: Failed to remove shuffle 1
akka.pattern.AskTimeoutException: Timed out
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$11.run(Scheduler.scala:118)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:455)
at akka.actor.LightArrayRevolverScheduler$$anon$12.executeBucket$1(Scheduler.scala:407)
at akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:411)
at akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:45:40 ERROR BlockManagerMaster: Failed to remove shuffle 3
akka.pattern.AskTimeoutException: Timed out
at akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334)
at akka.actor.Scheduler$$anon$11.run(Scheduler.scala:118)
at scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694)
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691)
at akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:455)
at akka.actor.LightArrayRevolverScheduler$$anon$12.executeBucket$1(Scheduler.scala:407)
at akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:411)
at akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:46:31 ERROR Executor: Exception in task ID 5280
java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.sql.catalyst.expressions.SumFunction.<init>(aggregates.scala:351)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:243)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:230)
at org.apache.spark.sql.execution.Aggregate.org$apache$spark$sql$execution$Aggregate$$newAggregateBuffer(Aggregate.scala:99)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:163)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:46:53 ERROR Executor: Exception in task ID 5270
java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOfRange(Arrays.java:3664)
at java.lang.String.<init>(String.java:201)
at java.nio.HeapCharBuffer.toString(HeapCharBuffer.java:567)
at java.nio.CharBuffer.toString(CharBuffer.java:1241)
at org.apache.hadoop.io.Text.decode(Text.java:350)
at org.apache.hadoop.io.Text.decode(Text.java:327)
at org.apache.hadoop.io.Text.toString(Text.java:254)
at org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector.getPrimitiveJavaObject(LazyStringObjectInspector.java:52)
at org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector.getPrimitiveJavaObject(LazyStringObjectInspector.java:28)
at org.apache.spark.sql.hive.HiveInspectors$class.unwrapData(hiveUdfs.scala:287)
at org.apache.spark.sql.hive.execution.HiveTableScan.unwrapData(HiveTableScan.scala:48)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$attributeFunctions$1$$anonfun$apply$3.apply(HiveTableScan.scala:101)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$attributeFunctions$1$$anonfun$apply$3.apply(HiveTableScan.scala:99)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$12$$anonfun$apply$5.apply(HiveTableScan.scala:203)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$12$$anonfun$apply$5.apply(HiveTableScan.scala:200)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:159)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
14/10/05 23:47:03 INFO ShuffleBlockManager: Deleted all files for shuffle 4
14/10/05 23:47:05 INFO ShuffleBlockManager: Deleted all files for shuffle 1
14/10/05 23:47:04 INFO ShuffleBlockManager: Deleted all files for shuffle 2
14/10/05 23:47:03 INFO ShuffleBlockManager: Deleted all files for shuffle 3
14/10/05 23:47:03 INFO ShuffleBlockManager: Deleted all files for shuffle 0
14/10/05 23:47:09 INFO TaskSetManager: Starting task 14.0:32 as TID 5290 on executor localhost: localhost (PROCESS_LOCAL)
14/10/05 23:47:17 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-92,5,main]
java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOfRange(Arrays.java:3664)
at java.lang.String.<init>(String.java:201)
at java.nio.HeapCharBuffer.toString(HeapCharBuffer.java:567)
at java.nio.CharBuffer.toString(CharBuffer.java:1241)
at org.apache.hadoop.io.Text.decode(Text.java:350)
at org.apache.hadoop.io.Text.decode(Text.java:327)
at org.apache.hadoop.io.Text.toString(Text.java:254)
at org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector.getPrimitiveJavaObject(LazyStringObjectInspector.java:52)
at org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector.getPrimitiveJavaObject(LazyStringObjectInspector.java:28)
at org.apache.spark.sql.hive.HiveInspectors$class.unwrapData(hiveUdfs.scala:287)
at org.apache.spark.sql.hive.execution.HiveTableScan.unwrapData(HiveTableScan.scala:48)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$attributeFunctions$1$$anonfun$apply$3.apply(HiveTableScan.scala:101)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$attributeFunctions$1$$anonfun$apply$3.apply(HiveTableScan.scala:99)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$12$$anonfun$apply$5.apply(HiveTableScan.scala:203)
at org.apache.spark.sql.hive.execution.HiveTableScan$$anonfun$12$$anonfun$apply$5.apply(HiveTableScan.scala:200)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:159)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
14/10/05 23:47:22 ERROR Executor: Exception in task ID 5267
java.lang.OutOfMemoryError: GC overhead limit exceeded
at scala.collection.mutable.ListBuffer.$plus$eq(ListBuffer.scala:164)
at scala.collection.mutable.ListBuffer.$plus$eq(ListBuffer.scala:45)
at scala.collection.SeqLike$$anonfun$distinct$1.apply(SeqLike.scala:495)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.SeqLike$class.distinct(SeqLike.scala:493)
at scala.collection.AbstractSeq.distinct(Seq.scala:40)
at org.apache.spark.sql.catalyst.expressions.Coalesce.resolved$lzycompute(nullFunctions.scala:34)
at org.apache.spark.sql.catalyst.expressions.Coalesce.resolved(nullFunctions.scala:34)
at org.apache.spark.sql.catalyst.expressions.Coalesce.dataType(nullFunctions.scala:38)
at org.apache.spark.sql.catalyst.expressions.Expression.n2(Expression.scala:100)
at org.apache.spark.sql.catalyst.expressions.Add.eval(arithmetic.scala:58)
at org.apache.spark.sql.catalyst.expressions.MutableLiteral.update(literals.scala:72)
at org.apache.spark.sql.catalyst.expressions.SumFunction.update(aggregates.scala:358)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:169)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:47:17 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-100,5,main]
java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.sql.catalyst.expressions.SumFunction.<init>(aggregates.scala:351)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:243)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:230)
at org.apache.spark.sql.execution.Aggregate.org$apache$spark$sql$execution$Aggregate$$newAggregateBuffer(Aggregate.scala:99)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:163)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:47:31 INFO TaskSetManager: Serialized task 14.0:32 as 4996 bytes in 13419 ms
14/10/05 23:47:52 ERROR Executor: Exception in task ID 5288
java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.sql.catalyst.expressions.SumFunction.<init>(aggregates.scala:353)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:243)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:230)
at org.apache.spark.sql.execution.Aggregate.org$apache$spark$sql$execution$Aggregate$$newAggregateBuffer(Aggregate.scala:99)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:163)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:47:52 INFO Executor: Running task ID 5290
14/10/05 23:47:52 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-97,5,main]
java.lang.OutOfMemoryError: GC overhead limit exceeded
at scala.collection.mutable.ListBuffer.$plus$eq(ListBuffer.scala:164)
at scala.collection.mutable.ListBuffer.$plus$eq(ListBuffer.scala:45)
at scala.collection.SeqLike$$anonfun$distinct$1.apply(SeqLike.scala:495)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.SeqLike$class.distinct(SeqLike.scala:493)
at scala.collection.AbstractSeq.distinct(Seq.scala:40)
at org.apache.spark.sql.catalyst.expressions.Coalesce.resolved$lzycompute(nullFunctions.scala:34)
at org.apache.spark.sql.catalyst.expressions.Coalesce.resolved(nullFunctions.scala:34)
at org.apache.spark.sql.catalyst.expressions.Coalesce.dataType(nullFunctions.scala:38)
at org.apache.spark.sql.catalyst.expressions.Expression.n2(Expression.scala:100)
at org.apache.spark.sql.catalyst.expressions.Add.eval(arithmetic.scala:58)
at org.apache.spark.sql.catalyst.expressions.MutableLiteral.update(literals.scala:72)
at org.apache.spark.sql.catalyst.expressions.SumFunction.update(aggregates.scala:358)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:169)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
14/10/05 23:47:52 INFO TaskSetManager: Starting task 14.0:33 as TID 5291 on executor localhost: localhost (PROCESS_LOCAL)
14/10/05 23:47:52 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-91,5,main]
java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.sql.catalyst.expressions.SumFunction.<init>(aggregates.scala:353)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:243)
at org.apache.spark.sql.catalyst.expressions.Sum.newInstance(aggregates.scala:230)
at org.apache.spark.sql.execution.Aggregate.org$apache$spark$sql$execution$Aggregate$$newAggregateBuffer(Aggregate.scala:99)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:163)
at org.apache.spark.sql.execution.Aggregate$$anonfun$execute$1$$anonfun$7.apply(Aggregate.scala:153)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:571)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:262)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:229)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:158)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:183)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)