Posted to user@spark.apache.org by Kürşat Kurt <ku...@kursatkurt.com> on 2016/10/29 20:51:08 UTC
Out Of Memory issue
Hi;
While training a NaiveBayes classifier, I am getting an OOM error.
What is wrong with these parameters?
Here is the spark-submit command: ./spark-submit --class main.scala.Test1 --master local[*] --driver-memory 60g /home/user1/project_2.11-1.0.jar
PS: The OS is Ubuntu 14.04 and the system has 64 GB RAM and a 256 GB SSD, with Spark 2.0.1.
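For comparison, a hedged variant of the same command that leaves the OS more headroom and raises the cap on results collected back to the driver; the exact values below are illustrative assumptions, not tested settings:

./spark-submit --class main.scala.Test1 \
  --master local[*] \
  --driver-memory 48g \
  --conf spark.driver.maxResultSize=8g \
  /home/user1/project_2.11-1.0.jar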
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece0 on 89.*************:35416 in memory (size: 4.0 MB, free: 31.7 GB)
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece1 on 89.*************:35416 in memory (size: 2.4 MB, free: 31.7 GB)
16/10/29 23:33:00 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (1 time so far)
16/10/29 23:34:42 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (2 times so far)
16/10/29 23:36:58 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (3 times so far)
16/10/29 23:41:27 WARN TaskMemoryManager: leak 21.2 GB memory from org.apache.spark.util.collection.ExternalAppendOnlyMap@43ab2e76
16/10/29 23:41:28 ERROR Executor: Exception in task 0.0 in stage 10.0 (TID 31)
java.lang.OutOfMemoryError: Java heap space
    at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
    at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
    at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
    at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
    at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
    at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-7,5,main]
java.lang.OutOfMemoryError: Java heap space
    at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
    at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
    at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
    at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
    at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
    at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.codec.CodecConfig: Compression: SNAPPY
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet block size to 134217728
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet dictionary page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Dictionary is on
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Validation is off
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Writer version is: PARQUET_1_0
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.InternalParquetRecordWriter: Flushing mem columnStore to file. allocated memory: 4,396,549
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.ColumnChunkPageWriteStore: written 4,157,541B for [labels, list, element] BINARY: 142,207 values, 5,600,131B raw, 4,156,878B comp, 6 pages, encodings: [PLAIN, RLE]
16/10/29 23:41:28 WARN TaskSetManager: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
    at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
    at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
    at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
    at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
    at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
    at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkContext: Invoking stop() from shutdown hook
16/10/29 23:41:28 ERROR TaskSetManager: Task 0 in stage 10.0 failed 1 times; aborting job
16/10/29 23:41:28 INFO TaskSchedulerImpl: Removed TaskSet 10.0, whose tasks have all completed, from pool
16/10/29 23:41:28 INFO TaskSchedulerImpl: Cancelling stage 10
16/10/29 23:41:28 INFO DAGScheduler: ResultStage 10 (collect at NaiveBayes.scala:400) failed in 570.233 s
16/10/29 23:41:28 INFO DAGScheduler: Job 5 failed: collect at NaiveBayes.scala:400, took 934.966523 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 1 times, most recent failure: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
    at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
    at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
    at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
    at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
    at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
    at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
    at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:400)
    at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:507)
    at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:114)
    at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:76)
    at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
    at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
    at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:149)
    at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:145)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at scala.collection.IterableViewLike$Transformed$class.foreach(IterableViewLike.scala:44)
    at scala.collection.SeqViewLike$AbstractTransformed.foreach(SeqViewLike.scala:37)
    at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:145)
    at main.scala.Test1$.main(Test1.scala:172)
    at main.scala.Test1.main(Test1.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.OutOfMemoryError: Java heap space
    at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
    at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
    at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
    at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
    at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
    at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
    at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
    at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
    at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
    at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
    at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkUI: Stopped Spark web UI at http://89.*************:4040
16/10/29 23:41:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/10/29 23:41:28 INFO MemoryStore: MemoryStore cleared
16/10/29 23:41:28 INFO BlockManager: BlockManager stopped
16/10/29 23:41:28 INFO BlockManagerMaster: BlockManagerMaster stopped
16/10/29 23:41:28 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/10/29 23:41:28 INFO SparkContext: Successfully stopped SparkContext
16/10/29 23:41:28 INFO ShutdownHookManager: Shutdown hook called
16/10/29 23:41:28 INFO ShutdownHookManager: Deleting directory /tmp/spark-15cf14e4-f103-4cbf-aa0f-85828eadbcce
RE: Out Of Memory issue
Posted by Kürşat Kurt <ku...@kursatkurt.com>.
Hi Jörn;
I am reading a 300,000-line CSV file. It is “ß”-separated (sample file attached). The first column is the class name and the second column is the product name.
The Java version is 1.8.0_108 and this is a single node. Furthermore (as you can see in the code) I tried random forests and that gets OOM too.
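A hypothetical two-row illustration of that layout (these rows are invented for illustration only; the real attachment is not reproduced here, and per the description above the class name comes before the “ß” and the product name after it):

Cep TelefonußSamsung Galaxy S7 32GB cep tel garantili
Telefon KılıfıßiPhone 6 silikon kılıf (resmi distribütör)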
Code:
package main.scala
import java.util.Locale
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import com.hrzafer.reshaturkishstemmer.Resha
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.LogisticRegression
import scala.collection.mutable.ListBuffer
import org.apache.spark.ml.classification.OneVsRest
import org.apache.spark.storage.StorageLevel
object Test1 {

  var num = 50;
  var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/";
  var stemmer = Resha.Instance

  var STOP_WORDS: Set[String] = Set();

  def cropSentence(s: String) = {
    s.replaceAll("\\([^\\)]*\\)", "")
      .replaceAll("(\\d+)(gb|GB)", "$1 $2")
      .replaceAll(" - ", " ")
      .replaceAll("-", " ")
      .replaceAll(" tr. ", " ")
      .replaceAll(" +", " ")
      .replaceAll(",", " ").trim();
  }

  def main(args: Array[String]): Unit = {

    val start1 = System.currentTimeMillis();

    val sc = new SparkConf().setAppName("Test")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate();
    import spark.implicits._

    // Split each line on "ß"; tokens(1) becomes the class name column and the
    // cleaned, stemmed tokens(0) becomes the product name column.
    val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
      .map(_.split("ß"))
      .map(tokens => {
        var list = new ListBuffer[String]();
        var token0 = cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")));
        token0.split("\\s+").foreach(word => list += stemmer.stem(word))
        (tokens(1), list.toList.mkString(" "))
      }).persist(StorageLevel.MEMORY_AND_DISK).toDF("className", "productName");

    val classIndexer = new StringIndexer()
      .setInputCol("className")
      .setOutputCol("label");
    val classIndexerModel = classIndexer.fit(mainDataset);
    var mainDS = classIndexerModel.transform(mainDataset);
    classIndexerModel.write.overwrite.save(savePath + "ClassIndexer");

    // Tokenizer
    val tokenizer = new Tokenizer()
      .setInputCol("productName")
      .setOutputCol("words_nonfiltered");

    // StopWords
    val remover = new StopWordsRemover()
      .setInputCol("words_nonfiltered")
      .setOutputCol("words")
      .setStopWords(Array[String]("garanti", "garantili", "resmi", "distribütör", "cep", "tel", "-", "//"));

    // CountVectorizer
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features");

    val rfc = new RandomForestClassifier()
      .setLabelCol("label")
      .setNumTrees(3)
      .setMaxDepth(3)
      .setFeatureSubsetStrategy("auto")
      .setFeaturesCol("features")
      .setImpurity("gini")
      .setMaxBins(3);

    val nb = new NaiveBayes()
      .setSmoothing(0.1)
      .setModelType("multinomial")

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, countVectorizer, nb));

    val splits = mainDS.randomSplit(Array(0.80, 0.20));
    val train = splits(0);
    //train.show(num, false);
    val test = splits(1);
    //test.show(num, false);
    //mainDataset.show(100, false);

    val model = pipeline.fit(train);
    model.write.overwrite.save(savePath + "RandomForestClassifier");
    //var model = rfc.fit(train);

    var result = model.transform(test);
    val predictionAndLabels = result.select("prediction", "label");
    val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy");
    System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
  }
}
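One hedged observation on the code above: the CountVectorizer places no cap on its vocabulary, so every distinct token across the 300,000 product names becomes a feature, and NaiveBayes then aggregates and collects per-class dense double arrays of that width to the driver (the `collect at NaiveBayes.scala:400` that fails in the log above). A minimal sketch of bounding the feature space; the concrete numbers are illustrative assumptions, not tuned values:

val countVectorizer = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(65536) // illustrative cap: keep only the 65,536 most frequent tokens
  .setMinDF(2)         // illustrative: drop tokens occurring in fewer than 2 product names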
From: Jörn Franke [mailto:jornfranke@gmail.com]
Sent: Sunday, October 30, 2016 12:44 AM
To: Kürşat Kurt <ku...@kursatkurt.com>
Cc: user@spark.apache.org
Subject: Re: Out Of Memory issue
What is the size and format of the input data?
Can you provide more details on your Spark job? RDD? DataFrame? etc. Java version? Is this a single node? It seems your executors and OS do not get a lot of memory.
Re: Out Of Memory issue
Posted by Nirav Patel <np...@xactlycorp.com>.
When you say "PS: The OS is Ubuntu 14.04 and the system has 64 GB RAM and a 256 GB SSD, with Spark 2.0.1",
do you mean you have 64 GB of free memory before running the Spark job? You can
verify with `free -g` on Red Hat; it may be the same on Ubuntu.
Also try JAVA_OPTS as Patrick mentioned. You can also monitor the running job
with `top` or similar to check how much memory it is actually getting.
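For example (a hedged sketch; the pgrep pattern assumes the default SparkSubmit main class appears in the process command line):

free -g                        # free/used memory in GB before submitting
top -p $(pgrep -f SparkSubmit) # resident memory of the driver JVM while the job runs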
On Mon, Oct 31, 2016 at 11:24 PM, Kürşat Kurt <ku...@kursatkurt.com> wrote:
> It is ok, but now getting “Size exceeds Integer.MAX_VALUE”
>
> 16/10/31 21:53:20 WARN MemoryStore: Not enough space to cache rdd_42_0 in memory! (computed 11.3 GB so far)
> 16/10/31 21:53:20 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 28.2 GB (scratch space shared across 2 tasks(s)) = 28.4 GB. Storage limit = 31.8 GB.
> 16/10/31 21:53:20 WARN BlockManager: Persisting block rdd_42_0 to disk instead.
> 16/10/31 21:56:31 INFO MemoryStore: Will not store rdd_42_1
> 16/10/31 21:56:31 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 25.5 GB so far)
> 16/10/31 21:56:31 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 25.4 GB (scratch space shared across 1 tasks(s)) = 25.6 GB. Storage limit = 31.8 GB.
> 16/10/31 21:56:31 WARN BlockManager: Persisting block rdd_42_1 to disk instead.
> 16/10/31 22:03:13 INFO BlockManagerInfo: Added rdd_42_1 on disk on 89.163.242.124:51975 (size: 12.6 GB)
> 16/10/31 22:03:13 ERROR Executor: Exception in task 1.0 in stage 13.0 (TID 36)
> java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
>     at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
>     at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
>     at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
>     at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
>     at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
>     at org.apache.spark.scheduler.Task.run(Task.scala:86)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
>
> 16/10/31 22:03:13 WARN TaskSetManager: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
>     at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
>     at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
>     at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
>     at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
>     at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
>     at org.apache.spark.scheduler.Task.run(Task.scala:86)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
>
> 16/10/31 22:03:13 ERROR TaskSetManager: Task 1 in stage 13.0 failed 1 times; aborting job
> 16/10/31 22:03:13 INFO TaskSchedulerImpl: Cancelling stage 13
> 16/10/31 22:03:13 INFO TaskSchedulerImpl: Stage 13 was cancelled
> 16/10/31 22:03:13 INFO Executor: Executor is trying to kill task 0.0 in stage 13.0 (TID 35)
> 16/10/31 22:03:13 INFO DAGScheduler: ShuffleMapStage 13 (mapPartitions at RandomForest.scala:521) failed in 763.029 s
> 16/10/31 22:03:13 INFO DAGScheduler: Job 8 failed: collectAsMap at RandomForest.scala:550, took 763.405717 s
>
> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 13.0 failed 1 times, most recent failure: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
>     at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
>     at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
>     at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
>     at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
>     at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
>     at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
>     at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
>     at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
>     at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
>     at org.apache.spark.scheduler.Task.run(Task.scala:86)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> *From:* Patrick Chen [mailto:czhenjupt@gmail.com]
> *Sent:* Tuesday, November 1, 2016 8:09 AM
> *To:* Kürşat Kurt <ku...@kursatkurt.com>
> *Subject:* Re: Out Of Memory issue
>
>
>
> I think you should set more memory on your heap (JAVA_OPTS -Xmx) and try
> again.
>
>
>
> 2016-11-01 12:20 GMT+08:00 Kürşat Kurt <ku...@kursatkurt.com>:
>
> Any idea about this?
>
>
>
> *From:* Kürşat Kurt [mailto:kursat@kursatkurt.com]
> *Sent:* Sunday, October 30, 2016 7:59 AM
> *To:* 'Jörn Franke' <jo...@gmail.com>
> *Cc:* 'user@spark.apache.org' <us...@spark.apache.org>
> *Subject:* RE: Out Of Memory issue
>
>
>
> Hi Jörn;
>
>
>
> I am reading a 300,000-line CSV file. It is “ß”-separated (attached sample
> file). The first column is the class name and the second column is the product name.
>
> The Java version is 1.8.108, on a single node. Furthermore (as you can see in the
> code) I tried random forests and that gets OOM too.
>
>
>
>
>
>
>
> Code:
>
> package main.scala
>
> import java.util.Locale
> import org.apache.spark.SparkConf
> import org.apache.spark.ml.classification.RandomForestClassifier
> import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
> import org.apache.spark.ml.feature.CountVectorizer
> import org.apache.spark.ml.feature.IndexToString
> import org.apache.spark.ml.feature.StandardScaler
> import org.apache.spark.ml.feature.StopWordsRemover
> import org.apache.spark.ml.feature.StringIndexer
> import org.apache.spark.ml.feature.Tokenizer
> import org.apache.spark.sql.SparkSession
> import com.hrzafer.reshaturkishstemmer.Resha
> import org.apache.spark.ml.feature.VectorIndexer
> import org.apache.spark.ml.feature.IDF
> import org.apache.spark.ml.Pipeline
> import org.apache.spark.ml.feature.NGram
> import org.apache.spark.ml.classification.NaiveBayes
> import org.apache.spark.ml.classification.DecisionTreeClassificationModel
> import org.apache.spark.ml.classification.DecisionTreeClassifier
> import org.apache.spark.ml.classification.LogisticRegression
> import scala.collection.mutable.ListBuffer
> import org.apache.spark.ml.classification.OneVsRest
> import org.apache.spark.storage.StorageLevel
>
> object Test1 {
>
>   var num = 50;
>   var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/";
>   var stemmer = Resha.Instance
>
>   var STOP_WORDS: Set[String] = Set();
>
>   def cropSentence(s: String) = {
>     s.replaceAll("\\([^\\)]*\\)", "")
>      .replaceAll("(\\d+)(gb|GB)", "$1 $2")
>      .replaceAll(" - ", " ")
>      .replaceAll("-", " ")
>      .replaceAll(" tr. ", " ")
>      .replaceAll(" +", " ")
>      .replaceAll(",", " ").trim();
>   }
>
>   def main(args: Array[String]): Unit = {
>
>     val start1 = System.currentTimeMillis();
>
>     val sc = new SparkConf().setAppName("Test")
>       .set("spark.hadoop.validateOutputSpecs", "false")
>       .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
>
>     val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate();
>     import spark.implicits._
>
>     val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
>       .map(_.split("ß"))
>       .map(tokens => {
>         var list = new ListBuffer[String]();
>         var token0 = cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")));
>         // foreach (not map): only the side effect of appending each stem is needed
>         token0.split("\\s+").foreach { w => list += stemmer.stem(w) }
>         (tokens(1), list.toList.mkString(" "))
>       }).persist(StorageLevel.MEMORY_AND_DISK).toDF("className", "productName");
>
>     val classIndexer = new StringIndexer()
>       .setInputCol("className")
>       .setOutputCol("label");
>
>     val classIndexerModel = classIndexer.fit(mainDataset);
>     var mainDS = classIndexerModel.transform(mainDataset);
>     classIndexerModel.write.overwrite.save(savePath + "ClassIndexer");
>
>     // Tokenizer
>     val tokenizer = new Tokenizer()
>       .setInputCol("productName")
>       .setOutputCol("words_nonfiltered");
>
>     // StopWords
>     val remover = new StopWordsRemover()
>       .setInputCol("words_nonfiltered")
>       .setOutputCol("words")
>       .setStopWords(Array[String]("garanti", "garantili", "resmi", "distribütör", "cep", "tel", "-", "//"));
>
>     // CountVectorizer
>     val countVectorizer = new CountVectorizer()
>       .setInputCol("words")
>       .setOutputCol("features");
>
>     val rfc = new RandomForestClassifier()
>       .setLabelCol("label")
>       .setNumTrees(3)
>       .setMaxDepth(3)
>       .setFeatureSubsetStrategy("auto")
>       .setFeaturesCol("features")
>       .setImpurity("gini")
>       .setMaxBins(3);
>
>     val nb = new NaiveBayes()
>       .setSmoothing(0.1)
>       .setModelType("multinomial")
>
>     val pipeline = new Pipeline().setStages(Array(tokenizer, remover, countVectorizer, nb));
>
>     val splits = mainDS.randomSplit(Array(0.80, 0.20));
>     val train = splits(0);
>     // train.show(num, false);
>     val test = splits(1);
>     // test.show(num, false);
>
>     // mainDataset.show(100, false);
>     val model = pipeline.fit(train);
>     model.write.overwrite.save(savePath + "RandomForestClassifier");
>     // var model = rfc.fit(train);
>
>     var result = model.transform(test);
>
>     val predictionAndLabels = result.select("prediction", "label");
>     val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy");
>     System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
>   }
> }
>
>
>
> *From:* Jörn Franke [mailto:jornfranke@gmail.com <jo...@gmail.com>]
> *Sent:* Sunday, October 30, 2016 12:44 AM
> *To:* Kürşat Kurt <ku...@kursatkurt.com>
> *Cc:* user@spark.apache.org
> *Subject:* Re: Out Of Memory issue
>
>
>
> What is the size and format of the input data?
>
> Can you provide more details on your Spark job? RDD? DataFrame? Java
> version? Is this a single node? It seems your executors and OS do not get a
> lot of memory.
>
>
> On 29 Oct 2016, at 22:51, Kürşat Kurt <ku...@kursatkurt.com> wrote:
>
> Hi;
>
>
>
> While training NaiveBayes classification, i am getting OOM.
>
> What is wrong with these parameters?
>
> Here is the spark-submit command: ./spark-submit --class main.scala.Test1
> --master local[*] --driver-memory 60g /home/user1/project_2.11-1.0.jar
>
>
>
> Ps: Os is Ubuntu 14.04 and system has 64GB RAM, 256GB SSD with spark
> 2.0.1.
>
>
>
> 16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece0 on 89.*************:35416 in memory (size: 4.0 MB, free: 31.7 GB)
> 16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece1 on 89.*************:35416 in memory (size: 2.4 MB, free: 31.7 GB)
> 16/10/29 23:33:00 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (1 time so far)
> 16/10/29 23:34:42 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (2 times so far)
> 16/10/29 23:36:58 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (3 times so far)
> 16/10/29 23:41:27 WARN TaskMemoryManager: leak 21.2 GB memory from org.apache.spark.util.collection.ExternalAppendOnlyMap@43ab2e76
> 16/10/29 23:41:28 ERROR Executor: Exception in task 0.0 in stage 10.0 (TID 31)
> java.lang.OutOfMemoryError: Java heap space
> at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
> at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
> at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
> at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
> 16/10/29 23:41:28 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-7,5,main]
> java.lang.OutOfMemoryError: Java heap space
> at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
> at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
> at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
> at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
>
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.codec.CodecConfig: Compression: SNAPPY
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet block size to 134217728
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet page size to 1048576
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet dictionary page size to 1048576
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Dictionary is on
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Validation is off
> Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Writer version is: PARQUET_1_0
> Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.InternalParquetRecordWriter: Flushing mem columnStore to file. allocated memory: 4,396,549
> Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.ColumnChunkPageWriteStore: written 4,157,541B for [labels, list, element] BINARY: 142,207 values, 5,600,131B raw, 4,156,878B comp, 6 pages, encodings: [PLAIN, RLE]
>
> 16/10/29 23:41:28 WARN TaskSetManager: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
> at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
> at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
> at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
> at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
>
>
>
> 16/10/29 23:41:28 INFO SparkContext: Invoking stop() from shutdown hook
> 16/10/29 23:41:28 ERROR TaskSetManager: Task 0 in stage 10.0 failed 1 times; aborting job
> 16/10/29 23:41:28 INFO TaskSchedulerImpl: Removed TaskSet 10.0, whose tasks have all completed, from pool
> 16/10/29 23:41:28 INFO TaskSchedulerImpl: Cancelling stage 10
> 16/10/29 23:41:28 INFO DAGScheduler: ResultStage 10 (collect at NaiveBayes.scala:400) failed in 570.233 s
> 16/10/29 23:41:28 INFO DAGScheduler: Job 5 failed: collect at NaiveBayes.scala:400, took 934.966523 s
>
> Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 1 times, most recent failure: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
> at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
> at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
> at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
> at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
>
>
>
> Driver stacktrace:
> at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
> at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
> at scala.Option.foreach(Option.scala:257)
> at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
> at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
> at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
> at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
> at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:400)
> at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:507)
> at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:114)
> at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:76)
> at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
> at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
> at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:149)
> at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:145)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
> at scala.collection.IterableViewLike$Transformed$class.foreach(IterableViewLike.scala:44)
> at scala.collection.SeqViewLike$AbstractTransformed.foreach(SeqViewLike.scala:37)
> at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:145)
> at main.scala.Test1$.main(Test1.scala:172)
> at main.scala.Test1.main(Test1.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>
> Caused by: java.lang.OutOfMemoryError: Java heap space
> at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
> at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
> at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
> at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
> at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
> at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
> at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
> at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
> at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
> at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
> at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
> at scala.collection.Iterator$class.foreach(Iterator.scala:893)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
> at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
> at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
> at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
> at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
> at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
>
> 16/10/29 23:41:28 INFO SparkUI: Stopped Spark web UI at http://89.*************:4040
> 16/10/29 23:41:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
> 16/10/29 23:41:28 INFO MemoryStore: MemoryStore cleared
> 16/10/29 23:41:28 INFO BlockManager: BlockManager stopped
> 16/10/29 23:41:28 INFO BlockManagerMaster: BlockManagerMaster stopped
> 16/10/29 23:41:28 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
> 16/10/29 23:41:28 INFO SparkContext: Successfully stopped SparkContext
> 16/10/29 23:41:28 INFO ShutdownHookManager: Shutdown hook called
> 16/10/29 23:41:28 INFO ShutdownHookManager: Deleting directory /tmp/spark-15cf14e4-f103-4cbf-aa0f-85828eadbcce
>
>
>
RE: Out Of Memory issue
Posted by Kürşat Kurt <ku...@kursatkurt.com>.
Hi Rohit;
Thank you for the suggestions.
I re-partitioned the dataframe, but I am getting the same error.
Used parameters:
spark-submit --class main.scala.Test1 --master local[8] --driver-memory 58g /project_2.11-1.0.jar
System properties:
Mode: client
Free Mem: 60 GB (Total: 64 GB)
OS: Ubuntu 14.04
Cores: 8
Java: 1.8
Could you have a look at the code below?
package main.scala

import java.util.Locale
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import com.hrzafer.reshaturkishstemmer.Resha
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.LogisticRegression
import scala.collection.mutable.ListBuffer
import org.apache.spark.ml.classification.OneVsRest
import org.apache.spark.storage.StorageLevel

object Test1 {

  var num = 50;
  var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/";
  var stemmer = Resha.Instance
  var STOP_WORDS: Set[String] = Set();

  def cropSentence(s: String) = {
    s.replaceAll("\\([^\\)]*\\)", "")
     .replaceAll("(\\d+)(gb|GB)", "$1 $2")
     .replaceAll(" - ", " ")
     .replaceAll("-", " ")
     .replaceAll(" tr. ", " ")
     .replaceAll(" +", " ")
     .replaceAll(",", " ").trim();
  }

  def main(args: Array[String]): Unit = {

    val start1 = System.currentTimeMillis();

    val sc = new SparkConf().setAppName("Test")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryoserializer.buffer.max", "1g")
      .set("spark.driver.maxResultSize", "0")
      .set("spark.sql.shuffle.partitions", "2001")
      .set("spark.sql.warehouse.dir", "hdfs://localhost:54310/SparkWork/wh")

    val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate();
    import spark.implicits._

    val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv", 5)
      .map(_.split("ß"))
      .map(tokens => {
        // println(tokens(0));
        var list = new ListBuffer[String]();
        var token0 = cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")));
        // foreach (not map): only the side effect of appending each stem is needed
        token0.split("\\s+").foreach { w => list += stemmer.stem(w) }
        (tokens(1), list.toList.mkString(" "))
      }).toDF("className", "productName");

    val classIndexer = new StringIndexer()
      .setInputCol("className")
      .setOutputCol("label");

    val classIndexerModel = classIndexer.fit(mainDataset);
    var mainDS = classIndexerModel.transform(mainDataset);
    classIndexerModel.write.overwrite.save(savePath + "ClassIndexer");

    // Tokenizer
    val tokenizer = new Tokenizer()
      .setInputCol("productName")
      .setOutputCol("words_nonfiltered");

    // StopWords
    val remover = new StopWordsRemover()
      .setInputCol("words_nonfiltered")
      .setOutputCol("words")
      .setStopWords(Array[String]("prod1", "prod2", "-", "//"));

    // CountVectorizer
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features");

    val nb = new NaiveBayes()
      .setSmoothing(0.1)
      .setModelType("multinomial")

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, countVectorizer, nb));

    val train = mainDS.repartition(500);
    val model = pipeline.fit(train); // <=========== Out of memory here
  }
}
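As a side note, here is a minimal sanity check (my own sketch; it assumes the `train` DataFrame from the code above and would go inside main() right after `val train = ...`) to confirm the repartitioning actually took effect before fit() runs:

// Sketch: print the partition count and the record count per partition,
// so that no single partition (and hence shuffle block) is disproportionately large.
println("partitions: " + train.rdd.getNumPartitions)
train.rdd
  .mapPartitionsWithIndex((i, it) => Iterator((i, it.size)))
  .collect()
  .foreach { case (i, n) => println(s"partition $i: $n records") }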
From: Rohit Kumar Prusty [mailto:Rohit_Prusty@infosys.com]
Sent: Wednesday, November 2, 2016 8:51 AM
To: Kürşat Kurt <ku...@kursatkurt.com>; 'Patrick Chen' <cz...@gmail.com>
Cc: user@spark.apache.org
Subject: RE: Out Of Memory issue
In the article “Top 5 Mistakes when writing Spark applications” I found an explanation that may help. Check if this applies.
In the error stack trace I see this:
java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
This issue occurs when a Spark shuffle block is greater than 2 GB. In MapReduce terminology, a shuffle block is a file written from one Mapper for a Reducer; the Reducer makes a local copy of this file (the reducer local copy) and then ‘reduces’ it.
The overflow exception occurs if the shuffle block size exceeds 2 GB, and it is especially problematic for Spark SQL:
• The default number of partitions to use when doing shuffles is 200.
  – This low number of partitions leads to a high shuffle block size.
Solution:
1. Increase the number of partitions, thereby reducing the average partition size.
2. Get rid of skew in your data (more on that later).
• In Spark SQL, increase the value of “spark.sql.shuffle.partitions”.
• In regular Spark applications, use rdd.repartition() (see the sketch below).
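For illustration, a minimal sketch of both options (the object name and the partition count 2001 are my own illustrative choices, not tested recommendations; the file path is reused from the code earlier in this thread):

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object ShufflePartitionsSketch {
  def main(args: Array[String]): Unit = {
    // Spark SQL / DataFrame jobs: raise the shuffle partition count so that
    // each shuffle block stays well below the 2 GB limit.
    val conf = new SparkConf()
      .setAppName("ShufflePartitionsSketch")
      .set("spark.sql.shuffle.partitions", "2001") // default is 200

    val spark = SparkSession.builder.config(conf).getOrCreate()

    // Plain RDD jobs: repartition before the wide (shuffle) operation to
    // shrink the average partition size, and with it the shuffle block size.
    val rdd = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
    println("partitions after repartition: " + rdd.repartition(2001).getNumPartitions)
  }
}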
Regards
Rohit Kumar Prusty
+91-9884070075
From: Kürşat Kurt [mailto:kursat@kursatkurt.com]
Sent: Tuesday, November 01, 2016 11:54 AM
To: 'Patrick Chen' <czhenjupt@gmail.com>
Cc: user@spark.apache.org
Subject: RE: Out Of Memory issue
It is OK, but now I am getting “Size exceeds Integer.MAX_VALUE”:
16/10/31 21:53:20 WARN MemoryStore: Not enough space to cache rdd_42_0 in memory! (computed 11.3 GB so far)
16/10/31 21:53:20 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 28.2 GB (scratch space shared across 2 tasks(s)) = 28.4 GB. Storage limit = 31.8 GB.
16/10/31 21:53:20 WARN BlockManager: Persisting block rdd_42_0 to disk instead.
16/10/31 21:56:31 INFO MemoryStore: Will not store rdd_42_1
16/10/31 21:56:31 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 25.5 GB so far)
16/10/31 21:56:31 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 25.4 GB (scratch space shared across 1 tasks(s)) = 25.6 GB. Storage limit = 31.8 GB.
16/10/31 21:56:31 WARN BlockManager: Persisting block rdd_42_1 to disk instead.
16/10/31 22:03:13 INFO BlockManagerInfo: Added rdd_42_1 on disk on 89.163.242.124:51975 (size: 12.6 GB)
16/10/31 22:03:13 ERROR Executor: Exception in task 1.0 in stage 13.0 (TID 36)
java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 WARN TaskSetManager: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 ERROR TaskSetManager: Task 1 in stage 13.0 failed 1 times; aborting job
16/10/31 22:03:13 INFO TaskSchedulerImpl: Cancelling stage 13
16/10/31 22:03:13 INFO TaskSchedulerImpl: Stage 13 was cancelled
16/10/31 22:03:13 INFO Executor: Executor is trying to kill task 0.0 in stage 13.0 (TID 35)
16/10/31 22:03:13 INFO DAGScheduler: ShuffleMapStage 13 (mapPartitions at RandomForest.scala:521) failed in 763.029 s
16/10/31 22:03:13 INFO DAGScheduler: Job 8 failed: collectAsMap at RandomForest.scala:550, took 763.405717 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 13.0 failed 1 times, most recent failure: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
From: Patrick Chen [mailto:czhenjupt@gmail.com]
Sent: Tuesday, November 1, 2016 8:09 AM
To: Kürşat Kurt <kursat@kursatkurt.com>
Subject: Re: Out Of Memory issue
I think you should set more memory on your heap (JAVA_OPTS -Xmx) and try again.
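For example, roughly (a sketch only; exact values depend on the machine, and in local mode the executor runs inside the driver JVM, so --driver-memory is effectively the -Xmx of the whole application):

./spark-submit --class main.scala.Test1 \
  --master local[*] \
  --driver-memory 60g \
  --conf spark.driver.extraJavaOptions=-XX:+UseG1GC \
  /home/user1/project_2.11-1.0.jar

As far as I know, Spark does not accept -Xmx inside spark.driver.extraJavaOptions; the heap size has to go through --driver-memory (or spark.driver.memory), which is why a plain JAVA_OPTS setting may not take effect here.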
2016-11-01 12:20 GMT+08:00 Kürşat Kurt <kursat@kursatkurt.com <ma...@kursatkurt.com> >:
Any idea about this?
From: Kürşat Kurt [mailto:kursat@kursatkurt.com]
Sent: Sunday, October 30, 2016 7:59 AM
To: 'Jörn Franke' <jornfranke@gmail.com>
Cc: 'user@spark.apache.org' <user@spark.apache.org>
Subject: RE: Out Of Memory issue
Hi Jörn;
I am reading a 300,000-line CSV file. It is “ß”-separated (attached sample file). The first column is the class name and the second column is the product name.
The Java version is 1.8.108, on a single node. Furthermore (as you can see in the code) I tried random forests and that gets OOM too.
Code:

package main.scala

import java.util.Locale
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import com.hrzafer.reshaturkishstemmer.Resha
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.LogisticRegression
import scala.collection.mutable.ListBuffer
import org.apache.spark.ml.classification.OneVsRest
import org.apache.spark.storage.StorageLevel

object Test1 {

  var num = 50;
  var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/";
  var stemmer = Resha.Instance
  var STOP_WORDS: Set[String] = Set();

  def cropSentence(s: String) = {
    s.replaceAll("\\([^\\)]*\\)", "")
     .replaceAll("(\\d+)(gb|GB)", "$1 $2")
     .replaceAll(" - ", " ")
     .replaceAll("-", " ")
     .replaceAll(" tr. ", " ")
     .replaceAll(" +", " ")
     .replaceAll(",", " ").trim();
  }

  def main(args: Array[String]): Unit = {

    val start1 = System.currentTimeMillis();

    val sc = new SparkConf().setAppName("Test")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate();
    import spark.implicits._

    val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
      .map(_.split("ß"))
      .map(tokens => {
        var list = new ListBuffer[String]();
        var token0 = cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")));
        // foreach (not map): only the side effect of appending each stem is needed
        token0.split("\\s+").foreach { w => list += stemmer.stem(w) }
        (tokens(1), list.toList.mkString(" "))
      }).persist(StorageLevel.MEMORY_AND_DISK).toDF("className", "productName");

    val classIndexer = new StringIndexer()
      .setInputCol("className")
      .setOutputCol("label");

    val classIndexerModel = classIndexer.fit(mainDataset);
    var mainDS = classIndexerModel.transform(mainDataset);
    classIndexerModel.write.overwrite.save(savePath + "ClassIndexer");

    // Tokenizer
    val tokenizer = new Tokenizer()
      .setInputCol("productName")
      .setOutputCol("words_nonfiltered");

    // StopWords
    val remover = new StopWordsRemover()
      .setInputCol("words_nonfiltered")
      .setOutputCol("words")
      .setStopWords(Array[String]("garanti", "garantili", "resmi", "distribütör", "cep", "tel", "-", "//"));

    // CountVectorizer
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features");

    val rfc = new RandomForestClassifier()
      .setLabelCol("label")
      .setNumTrees(3)
      .setMaxDepth(3)
      .setFeatureSubsetStrategy("auto")
      .setFeaturesCol("features")
      .setImpurity("gini")
      .setMaxBins(3);

    val nb = new NaiveBayes()
      .setSmoothing(0.1)
      .setModelType("multinomial")

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, countVectorizer, nb));

    val splits = mainDS.randomSplit(Array(0.80, 0.20));
    val train = splits(0);
    // train.show(num, false);
    val test = splits(1);
    // test.show(num, false);

    // mainDataset.show(100, false);
    val model = pipeline.fit(train);
    model.write.overwrite.save(savePath + "RandomForestClassifier");
    // var model = rfc.fit(train);

    var result = model.transform(test);

    val predictionAndLabels = result.select("prediction", "label");
    val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy");
    System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
  }
}
From: Jörn Franke [mailto:jornfranke@gmail.com]
Sent: Sunday, October 30, 2016 12:44 AM
To: Kürşat Kurt <kursat@kursatkurt.com>
Cc: user@spark.apache.org
Subject: Re: Out Of Memory issue
What is the size and format of the input data?
Can you provide more details on your Spark job? RDD? DataFrame? Java version? Is this a single node? It seems your executors and OS do not get a lot of memory.
On 29 Oct 2016, at 22:51, Kürşat Kurt <kursat@kursatkurt.com> wrote:
Hi;
While training NaiveBayes classification, I am getting OOM.
What is wrong with these parameters?
Here is the spark-submit command: ./spark-submit --class main.scala.Test1 --master local[*] --driver-memory 60g /home/user1/project_2.11-1.0.jar
Ps: Os is Ubuntu 14.04 and system has 64GB RAM, 256GB SSD with spark 2.0.1.
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece0 on 89.*************:35416 in memory (size: 4.0 MB, free: 31.7 GB)
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece1 on 89.*************:35416 in memory (size: 2.4 MB, free: 31.7 GB)
16/10/29 23:33:00 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (1 time so far)
16/10/29 23:34:42 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (2 times so far)
16/10/29 23:36:58 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (3 times so far)
16/10/29 23:41:27 WARN TaskMemoryManager: leak 21.2 GB memory from org.apache.spark.util.collection.ExternalAppendOnlyMap@43ab2e76
16/10/29 23:41:28 ERROR Executor: Exception in task 0.0 in stage 10.0 (TID 31)
java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-7,5,main]
java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.codec.CodecConfig: Compression: SNAPPY
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet block size to 134217728
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet dictionary page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Dictionary is on
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Validation is off
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Writer version is: PARQUET_1_0
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.InternalParquetRecordWriter: Flushing mem columnStore to file. allocated memory: 4,396,549
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.ColumnChunkPageWriteStore: written 4,157,541B for [labels, list, element] BINARY: 142,207 values, 5,600,131B raw, 4,156,878B comp, 6 pages, encodings: [PLAIN, RLE]
16/10/29 23:41:28 WARN TaskSetManager: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkContext: Invoking stop() from shutdown hook
16/10/29 23:41:28 ERROR TaskSetManager: Task 0 in stage 10.0 failed 1 times; aborting job
16/10/29 23:41:28 INFO TaskSchedulerImpl: Removed TaskSet 10.0, whose tasks have all completed, from pool
16/10/29 23:41:28 INFO TaskSchedulerImpl: Cancelling stage 10
16/10/29 23:41:28 INFO DAGScheduler: ResultStage 10 (collect at NaiveBayes.scala:400) failed in 570.233 s
16/10/29 23:41:28 INFO DAGScheduler: Job 5 failed: collect at NaiveBayes.scala:400, took 934.966523 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 1 times, most recent failure: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:400)
at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:507)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:114)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:76)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:149)
at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:145)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableViewLike$Transformed$class.foreach(IterableViewLike.scala:44)
at scala.collection.SeqViewLike$AbstractTransformed.foreach(SeqViewLike.scala:37)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:145)
at main.scala.Test1$.main(Test1.scala:172)
at main.scala.Test1.main(Test1.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkUI: Stopped Spark web UI at http://89.*************:4040
16/10/29 23:41:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/10/29 23:41:28 INFO MemoryStore: MemoryStore cleared
16/10/29 23:41:28 INFO BlockManager: BlockManager stopped
16/10/29 23:41:28 INFO BlockManagerMaster: BlockManagerMaster stopped
16/10/29 23:41:28 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/10/29 23:41:28 INFO SparkContext: Successfully stopped SparkContext
16/10/29 23:41:28 INFO ShutdownHookManager: Shutdown hook called
16/10/29 23:41:28 INFO ShutdownHookManager: Deleting directory /tmp/spark-15cf14e4-f103-4cbf-aa0f-85828eadbcce
RE: Out Of Memory issue
Posted by Rohit Kumar Prusty <Ro...@infosys.com>.
I figured this out from the article “Top 5 Mistakes when writing Spark applications”. Check if this helps.
In the error stack trace I see the following:
java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
This issue occurs when a Spark shuffle block is greater than 2 GB. In MapReduce terminology, a shuffle block is a file written by one Mapper for a Reducer; the Reducer makes a local copy of this file (the reducer-local copy) and then “reduces” it.
The overflow exception occurs whenever a shuffle block grows beyond 2 GB, and it is especially problematic for Spark SQL:
• The default number of partitions to use when doing shuffles is 200
– This low number of partitions leads to a large shuffle block size
Solution:
1. Increase the number of partitions
– Thereby reducing the average partition size (see the sketch below)
2. Get rid of skew in your data
– More on that later
• In Spark SQL, increase the value of “spark.sql.shuffle.partitions”
• In regular Spark applications, use rdd.repartition()
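A minimal sketch of both options in Scala (assuming spark is the active SparkSession and inputRdd stands for whichever RDD feeds the shuffle; the value 2000 is purely illustrative, not a tuned number):

// Spark SQL / DataFrame jobs: raise the shuffle partition count (default is 200)
spark.conf.set("spark.sql.shuffle.partitions", "2000")

// Plain RDD jobs: repartition before the wide (shuffling) operation
val repartitioned = inputRdd.repartition(2000)

More, smaller partitions keep each shuffle block under the 2 GB limit and also shrink the per-task scratch space, which can help with the earlier heap-space errors.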
Regards
Rohit Kumar Prusty
+91-9884070075
From: Kürşat Kurt [mailto:kursat@kursatkurt.com]
Sent: Tuesday, November 01, 2016 11:54 AM
To: 'Patrick Chen' <cz...@gmail.com>
Cc: user@spark.apache.org
Subject: RE: Out Of Memory issue
It is OK, but now I am getting “Size exceeds Integer.MAX_VALUE”:
16/10/31 21:53:20 WARN MemoryStore: Not enough space to cache rdd_42_0 in memory! (computed 11.3 GB so far)
16/10/31 21:53:20 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 28.2 GB (scratch space shared across 2 tasks(s)) = 28.4 GB. Storage limit = 31.8 GB.
16/10/31 21:53:20 WARN BlockManager: Persisting block rdd_42_0 to disk instead.
16/10/31 21:56:31 INFO MemoryStore: Will not store rdd_42_1
16/10/31 21:56:31 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 25.5 GB so far)
16/10/31 21:56:31 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 25.4 GB (scratch space shared across 1 tasks(s)) = 25.6 GB. Storage limit = 31.8 GB.
16/10/31 21:56:31 WARN BlockManager: Persisting block rdd_42_1 to disk instead.
16/10/31 22:03:13 INFO BlockManagerInfo: Added rdd_42_1 on disk on 89.163.242.124:51975 (size: 12.6 GB)
16/10/31 22:03:13 ERROR Executor: Exception in task 1.0 in stage 13.0 (TID 36)
java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 WARN TaskSetManager: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 ERROR TaskSetManager: Task 1 in stage 13.0 failed 1 times; aborting job
16/10/31 22:03:13 INFO TaskSchedulerImpl: Cancelling stage 13
16/10/31 22:03:13 INFO TaskSchedulerImpl: Stage 13 was cancelled
16/10/31 22:03:13 INFO Executor: Executor is trying to kill task 0.0 in stage 13.0 (TID 35)
16/10/31 22:03:13 INFO DAGScheduler: ShuffleMapStage 13 (mapPartitions at RandomForest.scala:521) failed in 763.029 s
16/10/31 22:03:13 INFO DAGScheduler: Job 8 failed: collectAsMap at RandomForest.scala:550, took 763.405717 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 13.0 failed 1 times, most recent failure: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
From: Patrick Chen [mailto:czhenjupt@gmail.com]
Sent: Tuesday, November 1, 2016 8:09 AM
To: Kürşat Kurt <ku...@kursatkurt.com>
Subject: Re: Out Of Memory issue
I think you should set more memory on your heap (JAVA_OPTS -Xmx); try again.
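In Spark terms the heap is sized through Spark's own settings rather than a raw -Xmx: with --master local[*] the executors live inside the driver JVM, so spark.driver.memory (the --driver-memory flag) is what actually grows the heap, and it has to be set at submit time because the JVM is already running when application code executes. A minimal sketch of the serializer-related settings that can still be set from code (values are illustrative only):

import org.apache.spark.SparkConf

// NOTE: the driver heap itself cannot be resized here; pass --driver-memory to spark-submit.
val conf = new SparkConf()
  .setAppName("Test")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // allow larger serialized records before Kryo's buffer overflows (default max is 64m)
  .set("spark.kryoserializer.buffer.max", "512m")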
2016-11-01 12:20 GMT+08:00 Kürşat Kurt <ku...@kursatkurt.com>:
Any idea about this?
From: Kürşat Kurt [mailto:kursat@kursatkurt.com]
Sent: Sunday, October 30, 2016 7:59 AM
To: 'Jörn Franke' <jo...@gmail.com>
Cc: 'user@spark.apache.org' <us...@spark.apache.org>
Subject: RE: Out Of Memory issue
Hi Jörn;
I am reading a 300,000-line CSV file. It is “ß”-separated (sample file attached). The first column is the class name and the second column is the product name.
Java version is 1.8.108, single node. Furthermore (as you can see in the code) I tried random forests and it got OOM too.
Code:
package main.scala
import java.util.Locale
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import com.hrzafer.reshaturkishstemmer.Resha
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.LogisticRegression
import scala.collection.mutable.ListBuffer
import org.apache.spark.ml.classification.OneVsRest
import org.apache.spark.storage.StorageLevel
object Test1 {
var num = 50;
var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/";
var stemmer = Resha.Instance
var STOP_WORDS: Set[String] = Set();
def cropSentence(s: String) = {
s.replaceAll("\\([^\\)]*\\)", "")
.replaceAll("(\\d+)(gb|GB)<file:///\\d+)(gb|GB)>", "$1 $2")
.replaceAll(" - ", " ")
.replaceAll("-", " ")
.replaceAll(" tr. ", " ")
.replaceAll(" +", " ")
.replaceAll(",", " ").trim();
}
def main(args: Array[String]): Unit = {
val start1 = System.currentTimeMillis();
val sc = new SparkConf().setAppName("Test")
.set("spark.hadoop.validateOutputSpecs", "false")
.set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate();
import spark.implicits._
val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
.map( _.split("ß"))
.map(tokens => {
var list=new ListBuffer[String]();
var token0=cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")));
token0.split("\\s+<file:///\\s+>").map {list+=stemmer.stem(_)}
(tokens(1), list.toList.mkString(" "))
}).persist(StorageLevel.MEMORY_AND_DISK).toDF("className","productName");
val classIndexer = new StringIndexer()
.setInputCol("className")
.setOutputCol("label");
val classIndexerModel = classIndexer.fit(mainDataset);
var mainDS=classIndexerModel.transform(mainDataset);
classIndexerModel.write.overwrite.save(savePath + "ClassIndexer");
//Tokenizer
val tokenizer = new Tokenizer()
.setInputCol("productName")
.setOutputCol("words_nonfiltered")
;
//StopWords
val remover = new StopWordsRemover()
.setInputCol("words_nonfiltered")
.setOutputCol("words")
.setStopWords( Array[String]("garanti","garantili","resmi","distribütör","cep","tel","-","//"));
//CountVectorize
val countVectorizer = new CountVectorizer()
.setInputCol("words")
.setOutputCol("features");
val rfc = new RandomForestClassifier ()
.setLabelCol("label")
.setNumTrees(3)
.setMaxDepth(3)
.setFeatureSubsetStrategy("auto")
.setFeaturesCol("features")
.setImpurity("gini")
.setMaxBins(3);
val nb = new NaiveBayes()
.setSmoothing(0.1)
.setModelType("multinomial")
val pipeline = new Pipeline().setStages(Array(tokenizer,remover,countVectorizer,nb));
val splits = mainDS.randomSplit(Array(0.80,0.20));
val train =splits(0);
//train.show(num,false);
val test = splits(1);
//test.show(num,false);
//mainDataset.show(100,false);
val model = pipeline.fit(train);
model.write.overwrite.save(savePath+"RandomForestClassifier");
//var model=rfc.fit(train);
var result = model.transform(test);
val predictionAndLabels = result.select("prediction", "label");
val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy");
System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels));
}
}
From: Jörn Franke [mailto:jornfranke@gmail.com]
Sent: Sunday, October 30, 2016 12:44 AM
To: Kürşat Kurt <ku...@kursatkurt.com>
Cc: user@spark.apache.org
Subject: Re: Out Of Memory issue
What is the size and format of the input data?
Can you provide more details on your Spark job? RDD? DataFrame? etc. Java version? Is this a single node? It seems your executors and OS do not get a lot of memory.
RE: Out Of Memory issue
Posted by Kürşat Kurt <ku...@kursatkurt.com>.
It is OK, but now I am getting “Size exceeds Integer.MAX_VALUE”:
16/10/31 21:53:20 WARN MemoryStore: Not enough space to cache rdd_42_0 in memory! (computed 11.3 GB so far)
16/10/31 21:53:20 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 28.2 GB (scratch space shared across 2 tasks(s)) = 28.4 GB. Storage limit = 31.8 GB.
16/10/31 21:53:20 WARN BlockManager: Persisting block rdd_42_0 to disk instead.
16/10/31 21:56:31 INFO MemoryStore: Will not store rdd_42_1
16/10/31 21:56:31 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 25.5 GB so far)
16/10/31 21:56:31 INFO MemoryStore: Memory use = 126.4 MB (blocks) + 25.4 GB (scratch space shared across 1 tasks(s)) = 25.6 GB. Storage limit = 31.8 GB.
16/10/31 21:56:31 WARN BlockManager: Persisting block rdd_42_1 to disk instead.
16/10/31 22:03:13 INFO BlockManagerInfo: Added rdd_42_1 on disk on 89.163.242.124:51975 (size: 12.6 GB)
16/10/31 22:03:13 ERROR Executor: Exception in task 1.0 in stage 13.0 (TID 36)
java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 WARN TaskSetManager: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/10/31 22:03:13 ERROR TaskSetManager: Task 1 in stage 13.0 failed 1 times; aborting job
16/10/31 22:03:13 INFO TaskSchedulerImpl: Cancelling stage 13
16/10/31 22:03:13 INFO TaskSchedulerImpl: Stage 13 was cancelled
16/10/31 22:03:13 INFO Executor: Executor is trying to kill task 0.0 in stage 13.0 (TID 35)
16/10/31 22:03:13 INFO DAGScheduler: ShuffleMapStage 13 (mapPartitions at RandomForest.scala:521) failed in 763.029 s
16/10/31 22:03:13 INFO DAGScheduler: Job 8 failed: collectAsMap at RandomForest.scala:550, took 763.405717 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 13.0 failed 1 times, most recent failure: Lost task 1.0 in stage 13.0 (TID 36, localhost): java.lang.IllegalArgumentException: Size exceeds Integer.MAX_VALUE
at sun.nio.ch.FileChannelImpl.map(FileChannelImpl.java:869)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:103)
at org.apache.spark.storage.DiskStore$$anonfun$getBytes$2.apply(DiskStore.scala:91)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1307)
at org.apache.spark.storage.DiskStore.getBytes(DiskStore.scala:105)
at org.apache.spark.storage.BlockManager.getLocalValues(BlockManager.scala:438)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:674)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:86)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
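The "Size exceeds Integer.MAX_VALUE" above is thrown by FileChannel.map: Spark memory-maps blocks it reads back from disk (DiskStore.getBytes in the trace), and a single cached or shuffle block larger than 2 GB cannot be mapped. The usual remedy is more, smaller partitions, so no block approaches that limit. A minimal sketch, assuming the mainDataset DataFrame from the code quoted later in this thread; the partition count is illustrative:

import org.apache.spark.storage.StorageLevel

// Illustrative only: spread the data over more partitions so each cached
// block stays far below the 2 GB mmap limit (aim for roughly 100-200 MB each).
val repartitioned = mainDataset.repartition(200)
  .persist(StorageLevel.MEMORY_AND_DISK)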
From: Patrick Chen [mailto:czhenjupt@gmail.com]
Sent: Tuesday, November 1, 2016 8:09 AM
To: Kürşat Kurt <ku...@kursatkurt.com>
Subject: Re: Out Of Memory issue
I think you should set more memory on your heap (JAVA_OPTS -Xmx); try again.
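Worth noting: with --master local[*] everything runs inside the single driver JVM, so the heap is governed by --driver-memory (equivalently -Xmx) at submit time; setting memory options in SparkConf inside main() comes too late, since the JVM has already started. A hedged sketch with illustrative values; on a 64 GB machine, leaving headroom for the OS and off-heap buffers is safer than claiming 60g for the heap:

./spark-submit --class main.scala.Test1 \
  --master local[*] \
  --driver-memory 48g \
  /home/user1/project_2.11-1.0.jar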
2016-11-01 12:20 GMT+08:00 Kürşat Kurt <kursat@kursatkurt.com>:
Any idea about this?
From: Kürşat Kurt [mailto:kursat@kursatkurt.com]
Sent: Sunday, October 30, 2016 7:59 AM
To: 'Jörn Franke' <jornfranke@gmail.com>
Cc: 'user@spark.apache.org' <user@spark.apache.org>
Subject: RE: Out Of Memory issue
Hi Jörn;
I am reading a 300,000-line CSV file. It is “ß”-separated (sample file attached). The first column is the class name and the second column is the product name.
Java version is 1.8.108, single node. Furthermore (as you can see in the code) I tried random forests and this gets OOM too.
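As a side note, a minimal sketch of loading such a file with the DataFrame CSV reader instead of textFile; this assumes Spark 2.x's single-character "sep" option ("ß" is one character), that spark is the SparkSession from the code below, and column names following the description above:

val raw = spark.read
  .option("sep", "ß")        // custom single-character field separator
  .option("header", "false") // the file has no header row
  .csv("hdfs://localhost:54310/SparkWork/classifications.csv")
  .toDF("className", "productName")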
Code :
package main.scala

import java.util.Locale
import org.apache.spark.SparkConf
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.IndexToString
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SparkSession
import com.hrzafer.reshaturkishstemmer.Resha
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.LogisticRegression
import scala.collection.mutable.ListBuffer
import org.apache.spark.ml.classification.OneVsRest
import org.apache.spark.storage.StorageLevel

object Test1 {

  var num = 50
  var savePath = "hdfs://localhost:54310/SparkWork/SparkModel/"
  var stemmer = Resha.Instance
  var STOP_WORDS: Set[String] = Set()

  def cropSentence(s: String) = {
    s.replaceAll("\\([^\\)]*\\)", "")
      .replaceAll("(\\d+)(gb|GB)", "$1 $2")
      .replaceAll(" - ", " ")
      .replaceAll("-", " ")
      .replaceAll(" tr. ", " ")
      .replaceAll(" +", " ")
      .replaceAll(",", " ").trim()
  }

  def main(args: Array[String]): Unit = {
    val start1 = System.currentTimeMillis()
    val sc = new SparkConf().setAppName("Test")
      .set("spark.hadoop.validateOutputSpecs", "false")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val spark = SparkSession.builder.appName("Java Spark").config(sc).getOrCreate()
    import spark.implicits._

    // Column 0 of the file is cleaned, lower-cased (Turkish locale) and stemmed;
    // column 1 is kept as-is. The tuple below maps column 1 to "className" and
    // the stemmed text to "productName".
    val mainDataset = spark.sparkContext.textFile("hdfs://localhost:54310/SparkWork/classifications.csv")
      .map(_.split("ß"))
      .map(tokens => {
        val list = new ListBuffer[String]()
        val token0 = cropSentence(tokens(0).toLowerCase(Locale.forLanguageTag("TR-tr")))
        // foreach, not map: only the side effect of appending stems is wanted
        token0.split("\\s+").foreach(t => list += stemmer.stem(t))
        (tokens(1), list.toList.mkString(" "))
      }).persist(StorageLevel.MEMORY_AND_DISK).toDF("className", "productName")

    val classIndexer = new StringIndexer()
      .setInputCol("className")
      .setOutputCol("label")
    val classIndexerModel = classIndexer.fit(mainDataset)
    var mainDS = classIndexerModel.transform(mainDataset)
    classIndexerModel.write.overwrite.save(savePath + "ClassIndexer")

    // Tokenizer
    val tokenizer = new Tokenizer()
      .setInputCol("productName")
      .setOutputCol("words_nonfiltered")

    // StopWords
    val remover = new StopWordsRemover()
      .setInputCol("words_nonfiltered")
      .setOutputCol("words")
      .setStopWords(Array[String]("garanti", "garantili", "resmi", "distribütör", "cep", "tel", "-", "//"))

    // CountVectorizer
    val countVectorizer = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")

    val rfc = new RandomForestClassifier()
      .setLabelCol("label")
      .setNumTrees(3)
      .setMaxDepth(3)
      .setFeatureSubsetStrategy("auto")
      .setFeaturesCol("features")
      .setImpurity("gini")
      .setMaxBins(3)

    val nb = new NaiveBayes()
      .setSmoothing(0.1)
      .setModelType("multinomial")

    // The final pipeline stage is NaiveBayes; the save path name below is a leftover label.
    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, countVectorizer, nb))

    val splits = mainDS.randomSplit(Array(0.80, 0.20))
    val train = splits(0)
    //train.show(num, false)
    val test = splits(1)
    //test.show(num, false)
    //mainDataset.show(100, false)

    val model = pipeline.fit(train)
    model.write.overwrite.save(savePath + "RandomForestClassifier")
    //var model = rfc.fit(train)
    var result = model.transform(test)
    val predictionAndLabels = result.select("prediction", "label")
    val evaluator = new MulticlassClassificationEvaluator().setMetricName("accuracy")
    System.out.println("Accuracy = " + evaluator.evaluate(predictionAndLabels))
  }
}
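One hedged observation on the pipeline above: CountVectorizer with no vocabulary cap over 300,000 product names can produce very wide feature vectors, and NaiveBayes training collects dense per-class aggregates on the driver (the collect at NaiveBayes.scala:400 in the trace), so heap use grows with vocabulary size times number of classes. setVocabSize and setMinDF are standard CountVectorizer parameters; the values here are illustrative:

val countVectorizerCapped = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(50000) // illustrative: keep only the 50k most frequent terms
  .setMinDF(2)         // illustrative: drop terms seen in fewer than 2 documents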
From: Jörn Franke [mailto:jornfranke@gmail.com]
Sent: Sunday, October 30, 2016 12:44 AM
To: Kürşat Kurt <kursat@kursatkurt.com>
Cc: user@spark.apache.org
Subject: Re: Out Of Memory issue
What is the size and format of the input data?
Can you provide more details on your Spark job? Rdd? Dataframe? Etc. Java Version? Is this a single node? It seems your executors and os do not get a lot of memory
On 29 Oct 2016, at 22:51, Kürşat Kurt <kursat@kursatkurt.com> wrote:
Hi;
While training NaiveBayes classification, I am getting OOM.
What is wrong with these parameters?
Here is the spark-submit command: ./spark-submit --class main.scala.Test1 --master local[*] --driver-memory 60g /home/user1/project_2.11-1.0.jar
PS: OS is Ubuntu 14.04 and the system has 64GB RAM and a 256GB SSD, with Spark 2.0.1.
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece0 on 89.*************:35416 in memory (size: 4.0 MB, free: 31.7 GB)
16/10/29 23:32:21 INFO BlockManagerInfo: Removed broadcast_10_piece1 on 89.*************:35416 in memory (size: 2.4 MB, free: 31.7 GB)
16/10/29 23:33:00 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (1 time so far)
16/10/29 23:34:42 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (2 times so far)
16/10/29 23:36:58 INFO ExternalAppendOnlyMap: Thread 123 spilling in-memory map of 31.8 GB to disk (3 times so far)
16/10/29 23:41:27 WARN TaskMemoryManager: leak 21.2 GB memory from org.apache.spark.util.collection.ExternalAppendOnlyMap@43ab2e76
16/10/29 23:41:28 ERROR Executor: Exception in task 0.0 in stage 10.0 (TID 31)
java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-7,5,main]
java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.codec.CodecConfig: Compression: SNAPPY
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet block size to 134217728
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Parquet dictionary page size to 1048576
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Dictionary is on
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Validation is off
Oct 29, 2016 11:25:48 PM INFO: org.apache.parquet.hadoop.ParquetOutputFormat: Writer version is: PARQUET_1_0
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.InternalParquetRecordWriter: Flushing mem columnStore to file. allocated memory: 4,396,549
Oct 29, 2016 11:25:49 PM INFO: org.apache.parquet.hadoop.ColumnChunkPageWriteStore: written 4,157,541B for [labels, list, element] BINARY: 142,207 values, 5,600,131B raw, 4,156,878B comp, 6 pages, encodings: [PLAIN, RLE]
16/10/29 23:41:28 WARN TaskSetManager: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkContext: Invoking stop() from shutdown hook
16/10/29 23:41:28 ERROR TaskSetManager: Task 0 in stage 10.0 failed 1 times; aborting job
16/10/29 23:41:28 INFO TaskSchedulerImpl: Removed TaskSet 10.0, whose tasks have all completed, from pool
16/10/29 23:41:28 INFO TaskSchedulerImpl: Cancelling stage 10
16/10/29 23:41:28 INFO DAGScheduler: ResultStage 10 (collect at NaiveBayes.scala:400) failed in 570.233 s
16/10/29 23:41:28 INFO DAGScheduler: Job 5 failed: collect at NaiveBayes.scala:400, took 934.966523 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 1 times, most recent failure: Lost task 0.0 in stage 10.0 (TID 31, localhost): java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:400)
at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:507)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:114)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:76)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:149)
at org.apache.spark.ml.Pipeline$$anonfun$fit$2.apply(Pipeline.scala:145)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableViewLike$Transformed$class.foreach(IterableViewLike.scala:44)
at scala.collection.SeqViewLike$AbstractTransformed.foreach(SeqViewLike.scala:37)
at org.apache.spark.ml.Pipeline.fit(Pipeline.scala:145)
at main.scala.Test1$.main(Test1.scala:172)
at main.scala.Test1.main(Test1.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.OutOfMemoryError: Java heap space
at com.esotericsoftware.kryo.io.Input.readDoubles(Input.java:885)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:222)
at com.esotericsoftware.kryo.serializers.DefaultArraySerializers$DoubleArraySerializer.read(DefaultArraySerializers.java:205)
at com.esotericsoftware.kryo.Kryo.readObjectOrNull(Kryo.java:759)
at com.esotericsoftware.kryo.serializers.ObjectField.read(ObjectField.java:132)
at com.esotericsoftware.kryo.serializers.FieldSerializer.read(FieldSerializer.java:551)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:42)
at com.twitter.chill.Tuple2Serializer.read(TupleSerializers.scala:33)
at com.esotericsoftware.kryo.Kryo.readClassAndObject(Kryo.java:790)
at org.apache.spark.serializer.KryoDeserializationStream.readObject(KryoSerializer.scala:229)
at org.apache.spark.serializer.DeserializationStream.readValue(Serializer.scala:159)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.readNextItem(ExternalAppendOnlyMap.scala:515)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$DiskMapIterator.hasNext(ExternalAppendOnlyMap.scala:535)
at scala.collection.Iterator$$anon$1.hasNext(Iterator.scala:1004)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.org$apache$spark$util$collection$ExternalAppendOnlyMap$ExternalIterator$$readNextHashCode(ExternalAppendOnlyMap.scala:336)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:409)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator$$anonfun$next$1.apply(ExternalAppendOnlyMap.scala:407)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:407)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.next(ExternalAppendOnlyMap.scala:302)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.foreach(ExternalAppendOnlyMap.scala:302)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.to(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.util.collection.ExternalAppendOnlyMap$ExternalIterator.toBuffer(ExternalAppendOnlyMap.scala:302)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
16/10/29 23:41:28 INFO SparkUI: Stopped Spark web UI at http://89.*************:4040
16/10/29 23:41:28 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
16/10/29 23:41:28 INFO MemoryStore: MemoryStore cleared
16/10/29 23:41:28 INFO BlockManager: BlockManager stopped
16/10/29 23:41:28 INFO BlockManagerMaster: BlockManagerMaster stopped
16/10/29 23:41:28 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
16/10/29 23:41:28 INFO SparkContext: Successfully stopped SparkContext
16/10/29 23:41:28 INFO ShutdownHookManager: Shutdown hook called
16/10/29 23:41:28 INFO ShutdownHookManager: Deleting directory /tmp/spark-15cf14e4-f103-4cbf-aa0f-85828eadbcce
RE: Out Of Memory issue
Posted by Kürşat Kurt <ku...@kursatkurt.com>.
Any idea about this?
Re: Out Of Memory issue
Posted by Jörn Franke <jo...@gmail.com>.
What is the size and format of the input data?
Can you provide more details on your Spark job? RDD or DataFrame? Which Java version? Is this a single node? It seems your executors and the OS are not left much memory.
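For reference: with --driver-memory 60g on a 64GB box, the OS and off-heap allocations get almost nothing, and in local mode the driver and executor share that single JVM, with local[*] running one task per core against the same heap. A minimal sketch of a more conservative submit, reusing the class and jar from your command (the flag values are illustrative guesses, not tuned for your data):

# Leave headroom for the OS and off-heap buffers; cap concurrency at 4 tasks
# so fewer ExternalAppendOnlyMaps compete for the heap at once.
./spark-submit --class main.scala.Test1 \
  --master local[4] \
  --driver-memory 40g \
  --conf spark.memory.fraction=0.6 \
  --conf spark.kryoserializer.buffer.max=512m \
  /home/user1/project_2.11-1.0.jar

Fewer concurrent tasks and a smaller heap reduce per-task memory pressure before the spills start; whether 40g/local[4] is the right trade-off depends on the input size you report back.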
> On 29 Oct 2016, at 22:51, Kürşat Kurt <ku...@kursatkurt.com> wrote:
>
> Hi;
>
> While training NaiveBayes classification, i am getting OOM.
> What is wrong with these parameters?
> Here is the spark-submit command: ./spark-submit --class main.scala.Test1 --master local[*] --driver-memory 60g /home/user1/project_2.11-1.0.jar
>
> Ps: Os is Ubuntu 14.04 and system has 64GB RAM, 256GB SSD with spark 2.0.1.