Posted to dev@spark.apache.org by Peter Rudenko <pe...@gmail.com> on 2015/06/25 15:35:41 UTC

[SQL] codegen on wide dataset throws StackOverflow

Hi, I have a small but very wide dataset (2000 columns). I'm trying to 
optimize a DataFrame pipeline for it, since it performs very poorly 
compared to the equivalent RDD operations.
With spark.sql.codegen=true it throws a StackOverflowError:

15/06/25 16:27:16 INFO CacheManager: Partition rdd_12_3 not found, computing it
15/06/25 16:27:16 INFO HadoopRDD: Input split: file:/home/peter/validation.csv:0+337768
15/06/25 16:27:16 INFO CacheManager: Partition rdd_12_1 not found, computing it
15/06/25 16:27:16 INFO HadoopRDD: Input split: file:/home/peter/work/train.csv:0+15540706
15/06/25 16:27:16 INFO CacheManager: Partition rdd_12_0 not found, computing it
15/06/25 16:27:16 INFO HadoopRDD: Input split: file:/home/peter/holdout.csv:0+336296
15/06/25 16:27:16 INFO CacheManager: Partition rdd_12_2 not found, computing it
15/06/25 16:27:16 INFO HadoopRDD: Input split: file:/home/peter/train.csv:15540706+14866642
15/06/25 16:27:17 ERROR Executor: Exception in task 1.0 in stage 1.0 (TID 2)
org.spark-project.guava.util.concurrent.ExecutionError: java.lang.StackOverflowError
	at org.spark-project.guava.cache.LocalCache$Segment.get(LocalCache.java:2261)
	at org.spark-project.guava.cache.LocalCache.get(LocalCache.java:4000)
	at org.spark-project.guava.cache.LocalCache.getOrLoad(LocalCache.java:4004)
	at org.spark-project.guava.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4874)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:105)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:102)
	at org.apache.spark.sql.execution.SparkPlan.newMutableProjection(SparkPlan.scala:170)
	at org.apache.spark.sql.execution.Project.buildProjection$lzycompute(basicOperators.scala:38)
	at org.apache.spark.sql.execution.Project.buildProjection(basicOperators.scala:38)
	at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:41)
	at org.apache.spark.sql.execution.Project$$anonfun$1.apply(basicOperators.scala:40)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:69)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:242)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:70)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.StackOverflowError
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1042)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1044)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1044)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1044)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1044)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullNameInternal(Symbols.scala:1044)
	at scala.reflect.internal.Symbols$Symbol.fullNameAsName(Symbols.scala:1047)
	at scala.reflect.internal.Symbols$Symbol.fullName(Symbols.scala:1036)
	at scala.reflect.internal.Symbols$Symbol.fullName(Symbols.scala:1052)
	at scala.reflect.internal.Types$TypeRef.needsPreString(Types.scala:2462)
	at scala.reflect.internal.Types$TypeRef.preString(Types.scala:2465)
	at scala.reflect.internal.Types$TypeRef.safeToString(Types.scala:2514)
	at scala.reflect.internal.Types$class.typeToString(Types.scala:7345)
	at scala.reflect.runtime.JavaUniverse.scala$reflect$runtime$SynchronizedTypes$$super$typeToString(JavaUniverse.scala:12)
	at scala.reflect.runtime.SynchronizedTypes$class.typeToString(SynchronizedTypes.scala:79)
	at scala.reflect.runtime.JavaUniverse.typeToString(JavaUniverse.scala:12)
	at scala.reflect.internal.Types$Type.toString(Types.scala:1018)
	at scala.reflect.internal.Printers$TreePrinter.printTree(Printers.scala:398)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$print$1.apply(Printers.scala:446)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$print$1.apply(Printers.scala:443)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
	at scala.reflect.internal.Printers$TreePrinter.print(Printers.scala:443)
	at scala.reflect.internal.Printers$TreePrinter.printOpt(Printers.scala:159)
	at scala.reflect.internal.Printers$TreePrinter.printTree(Printers.scala:218)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$print$1.apply(Printers.scala:446)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$print$1.apply(Printers.scala:443)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:34)
	at scala.reflect.internal.Printers$TreePrinter.print(Printers.scala:443)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$printColumn$2.apply(Printers.scala:95)
	at scala.reflect.internal.Printers$TreePrinter$$anonfun$printColumn$2.apply(Printers.scala:95)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)
	at scala.reflect.internal.Printers$TreePrinter.printSeq(Printers.scala:89)

             ...

It works for a thin dataset, but fails for the wide one.
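
In case it helps, here is a rough, self-contained sketch of the kind of 
job that hits this for me. It's simplified: it builds a synthetic 
2000-column DataFrame instead of reading my CSVs, and the object and 
column names in it are illustrative, not from my real pipeline.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

object WideCodegenRepro {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("wide-codegen-repro").setMaster("local[4]"))
    val sqlContext = new SQLContext(sc)
    // The flag from above; it defaults to false in Spark 1.4, and the
    // failure only appears once it is turned on.
    sqlContext.setConf("spark.sql.codegen", "true")

    // A single-row DataFrame with 2000 integer columns c0..c1999.
    val numCols = 2000
    val colNames = (0 until numCols).map(i => s"c$i")
    val schema = StructType(colNames.map(StructField(_, IntegerType, nullable = false)))
    val rows = sc.parallelize(Seq(Row.fromSeq(0 until numCols)))
    val df = sqlContext.createDataFrame(rows, schema)

    // Project all 2000 columns through an arithmetic expression; with
    // codegen enabled, generating the code for this wide Project is
    // where the StackOverflowError above shows up.
    df.select(colNames.map(c => df(c) + 1): _*).count()
  }
}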

Thanks,
Peter Rudenko

Re: [SQL] codegen on wide dataset throws StackOverflow

Posted by Peter Rudenko <pe...@gmail.com>.
I'm using Spark 1.4.0. Sure, I'll try to put together steps to reproduce 
and file a JIRA ticket.

Thanks,
Peter Rudenko

On 2015-06-26 11:14, Josh Rosen wrote:
> Which Spark version are you using?  Can you file a JIRA for this issue?
>
> On Thu, Jun 25, 2015 at 6:35 AM, Peter Rudenko 
> <petro.rudenko@gmail.com <ma...@gmail.com>> wrote:
>
>     [...]
>


Re: [SQL] codegen on wide dataset throws StackOverflow

Posted by Josh Rosen <ro...@gmail.com>.
Which Spark version are you using?  Can you file a JIRA for this issue?

On Thu, Jun 25, 2015 at 6:35 AM, Peter Rudenko <pe...@gmail.com>
wrote:

> [...]