Posted to user@spark.apache.org by Gal Benshlomo <ga...@startapp.com> on 2019/11/10 15:31:10 UTC

RE: PySpark Pandas UDF

Hi,

I'm using pandas_udf and am not able to run it in cluster mode, even though the same code works in standalone mode.

The code is as follows:




from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, LongType, StringType

spark = SparkSession.builder.getOrCreate()  # the usual SparkSession entry point

schema_test = StructType([
    StructField("cluster", LongType()),
    StructField("name", StringType())
])


@pandas_udf(schema_test, PandasUDFType.GROUPED_MAP)
def test_foo(pd_df):
    print('\n\nSid is problematic\n\n')
    pd_df['cluster'] = 1
    return pd_df[['name', 'cluster']]


department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
users_data = spark.createDataFrame([department1, department2])
res = users_data.groupby('id').apply(test_foo)

res.write.parquet(RES_OUTPUT_PATH, mode='overwrite')


The errors I'm getting are:
ERROR FileFormatWriter: Aborting job c6eefb8c-c8d5-4236-82d7-298924b03b25.
org.apache.spark.SparkException: Job aborted due to stage failure: Task 81 in stage 1.0 failed 4 times, most recent failure: Lost task 81.3 in stage 1.0 (TID 192, 10.10.1.17, executor 1): java.lang.IllegalArgumentException
at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
at org.apache.arrow.vector.ipc.message.MessageChannelReader.readNext(MessageChannelReader.java:58)
at org.apache.arrow.vector.ipc.ArrowStreamReader.readSchema(ArrowStreamReader.java:132)
at org.apache.arrow.vector.ipc.ArrowReader.initialize(ArrowReader.java:181)
at org.apache.arrow.vector.ipc.ArrowReader.ensureInitialized(ArrowReader.java:172)
at org.apache.arrow.vector.ipc.ArrowReader.getVectorSchemaRoot(ArrowReader.java:65)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:162)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:232)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:167)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:159)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:668)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:228)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:557)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException
at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
at org.apache.arrow.vector.ipc.message.MessageChannelReader.readNext(MessageChannelReader.java:58)
at org.apache.arrow.vector.ipc.ArrowStreamReader.readSchema(ArrowStreamReader.java:132)
at org.apache.arrow.vector.ipc.ArrowReader.initialize(ArrowReader.java:181)
at org.apache.arrow.vector.ipc.ArrowReader.ensureInitialized(ArrowReader.java:172)
at org.apache.arrow.vector.ipc.ArrowReader.getVectorSchemaRoot(ArrowReader.java:65)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:162)
at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:122)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:232)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:170)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:169)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:403)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:409)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
19/11/10 15:10:32 WARN TaskSetManager: Lost task 41.3 in stage 1.0 (TID 209, 10.10.1.17, executor 1): TaskKilled (Stage cancelled)
19/11/10 15:10:32 WARN TaskSetManager: Lost task 0.0 in stage 1.0 (TID 26, 10.10.1.17, executor 1): TaskKilled (Stage cancelled)
19/11/10 15:10:32 INFO TaskSchedulerImpl: Removed TaskSet 1.0, whose tasks have all completed, from pool
Traceback (most recent call last):
File "/opt/spark/work-dir/poi-attribution-engine/dev/stdbscanMain.py", line 203, in <module>
main()
File "/opt/spark/work-dir/poi-attribution-engine/dev/stdbscanMain.py", line 199, in main
visits.write.parquet(VISITS_OUTPUT_PATH, mode='overwrite')
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 841, in parquet
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/opt/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o290.parquet.



Running on Kubernetes.

Thanks in advance!


Gal Benshlomo / Data Science Team Leader
M: +972-52-2252105 / Skype: gal.ben.shlomo



Re: PySpark Pandas UDF

Posted by Gourav Sengupta <go...@gmail.com>.
Hi,

Sorry, a completely unrelated question.

When is the upcoming release of Spark 3.0? There are several parallel
distributed deep learning frameworks being developed; do you think we could
use Spark 3.0 for distributed deep learning using PyTorch or TensorFlow?

Is there any place where we can find the details?

Regards,
Gourav Sengupta

On Mon, Nov 18, 2019 at 7:09 AM Bryan Cutler <cu...@gmail.com> wrote:

> There was a change in the binary format of Arrow 0.15.1 and there is an
> environment variable you can set to make pyarrow 0.15.1 compatible with
> current Spark, which looks to be your problem. Please see the doc below for
> instructions added in SPARK-2936. Note, this will not be required for the
> upcoming release of Spark 3.0.0.
>
> https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibiliy-setting-for-pyarrow--0150-and-spark-23x-24x
>
> On Tue, Nov 12, 2019 at 7:53 AM Holden Karau <ho...@pigscanfly.ca> wrote:
>
>> Thanks for sharing that. I think we should maybe add some checks around
>> this so it’s easier to debug. I’m CCing Bryan who might have some thoughts.
>>
>> On Tue, Nov 12, 2019 at 7:42 AM gal.benshlomo <ga...@startapp.com>
>> wrote:
>>
>>> SOLVED!
>>> thanks for the help - I found the issue. it was the version of pyarrow
>>> (0.15.1) which apparently isn't currently stable. Downgrading it solved
>>> the
>>> issue for me
>>>
>>>
>>>
>>> --
>>> Sent from: http://apache-spark-user-list.1001560.n3.nabble.com/
>>>
>>> ---------------------------------------------------------------------
>>> To unsubscribe e-mail: user-unsubscribe@spark.apache.org
>>>
>>> --
>> Twitter: https://twitter.com/holdenkarau
>> Books (Learning Spark, High Performance Spark, etc.):
>> https://amzn.to/2MaRAG9
>> YouTube Live Streams: https://www.youtube.com/user/holdenkarau
>>
>

Re: PySpark Pandas UDF

Posted by Bryan Cutler <cu...@gmail.com>.
There was a change in the binary format of Arrow 0.15.1 and there is an
environment variable you can set to make pyarrow 0.15.1 compatible with
current Spark, which looks to be your problem. Please see the doc below for
instructions added in SPARK-2936. Note, this will not be required for the
upcoming release of Spark 3.0.0.
https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibiliy-setting-for-pyarrow--0150-and-spark-23x-24x
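
For reference, a minimal sketch of applying that compatibility setting from PySpark itself, assuming the environment variable named in the linked doc (ARROW_PRE_0_15_IPC_FORMAT) and an illustrative session builder; in cluster mode the same variable can instead be exported in conf/spark-env.sh inside the driver and executor images:

import os
from pyspark.sql import SparkSession

# Ask pyarrow >= 0.15.0 to write the legacy Arrow IPC format that
# Spark 2.3.x/2.4.x expects. Set it for the driver process...
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

# ...and propagate it to every executor via spark.executorEnv.*
# (the app name here is illustrative only).
spark = (
    SparkSession.builder
    .appName("pandas-udf-arrow-compat")
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    .getOrCreate()
)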

On Tue, Nov 12, 2019 at 7:53 AM Holden Karau <ho...@pigscanfly.ca> wrote:

> Thanks for sharing that. I think we should maybe add some checks around
> this so it’s easier to debug. I’m CCing Bryan who might have some thoughts.
>
> On Tue, Nov 12, 2019 at 7:42 AM gal.benshlomo <ga...@startapp.com>
> wrote:
>
>> SOLVED!
>> thanks for the help - I found the issue. it was the version of pyarrow
>> (0.15.1) which apparently isn't currently stable. Downgrading it solved
>> the
>> issue for me
>>
>>
>>
>> --
>> Sent from: http://apache-spark-user-list.1001560.n3.nabble.com/
>>
>> ---------------------------------------------------------------------
>> To unsubscribe e-mail: user-unsubscribe@spark.apache.org
>>
>> --
> Twitter: https://twitter.com/holdenkarau
> Books (Learning Spark, High Performance Spark, etc.):
> https://amzn.to/2MaRAG9
> YouTube Live Streams: https://www.youtube.com/user/holdenkarau
>

Re: PySpark Pandas UDF

Posted by Holden Karau <ho...@pigscanfly.ca>.
Thanks for sharing that. I think we should maybe add some checks around
this so it’s easier to debug. I’m CCing Bryan who might have some thoughts.

On Tue, Nov 12, 2019 at 7:42 AM gal.benshlomo <ga...@startapp.com>
wrote:

> SOLVED!
> thanks for the help - I found the issue. it was the version of pyarrow
> (0.15.1) which apparently isn't currently stable. Downgrading it solved the
> issue for me
>
>
>
> --
> Sent from: http://apache-spark-user-list.1001560.n3.nabble.com/
>
> ---------------------------------------------------------------------
> To unsubscribe e-mail: user-unsubscribe@spark.apache.org
>
> --
Twitter: https://twitter.com/holdenkarau
Books (Learning Spark, High Performance Spark, etc.):
https://amzn.to/2MaRAG9
YouTube Live Streams: https://www.youtube.com/user/holdenkarau

RE: PySpark Pandas UDF

Posted by "gal.benshlomo" <ga...@startapp.com>.
SOLVED!
Thanks for the help - I found the issue. It was the version of pyarrow
(0.15.1), which apparently isn't currently stable. Downgrading it solved the
issue for me.
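
For anyone hitting the same thing, a minimal sketch (not from the thread, and assuming the same `spark` session as the code above) to confirm which pyarrow version the driver and the executors actually see; a driver/executor mismatch is easy to miss when a job only fails in cluster mode:

import pyarrow

# Version visible to the driver.
print("driver pyarrow:", pyarrow.__version__)

# Versions visible to the executors.
def partition_pyarrow_version(_):
    import pyarrow
    yield pyarrow.__version__

executor_versions = (
    spark.sparkContext.parallelize(range(4), 4)
    .mapPartitions(partition_pyarrow_version)
    .distinct()
    .collect()
)
print("executor pyarrow:", executor_versions)

# On Spark 2.4.x either downgrade (e.g. pip install "pyarrow<0.15" in the
# executor image) or set the ARROW_PRE_0_15_IPC_FORMAT=1 compatibility
# variable from Bryan's reply.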



--
Sent from: http://apache-spark-user-list.1001560.n3.nabble.com/

---------------------------------------------------------------------
To unsubscribe e-mail: user-unsubscribe@spark.apache.org


Re: PySpark Pandas UDF

Posted by "gal.benshlomo" <ga...@startapp.com>.
Hi,
Thanks for your reply.
I tried what you suggested and I'm still getting the same error.
It's also worth mentioning that when I simply write the dataframe to S3,
without applying the function, it works.
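
In other words, a sketch of the two cases using the names from the original code (the output path placeholder stands in for the real S3 path):

users_data.write.parquet(RES_OUTPUT_PATH, mode='overwrite')    # plain write: works

res = users_data.groupby('id').apply(test_foo)                 # grouped-map pandas UDF
res.write.parquet(RES_OUTPUT_PATH, mode='overwrite')           # fails with the Arrow error above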



--
Sent from: http://apache-spark-user-list.1001560.n3.nabble.com/

---------------------------------------------------------------------
To unsubscribe e-mail: user-unsubscribe@spark.apache.org


Re: PySpark Pandas UDF

Posted by Holden Karau <ho...@pigscanfly.ca>.
Can you switch the write for a count, just so we can isolate whether it's the
write or the count?
Also, what's the output path you're using?

On Sun, Nov 10, 2019 at 7:31 AM Gal Benshlomo <ga...@startapp.com>
wrote:

>
>
> Hi,
>
>
>
> I’m using pandas_udf and not able to run it from cluster mode, even though
> the same code works on standalone.
>
>
>
> The code is as follows:
>
>
>
>
>
>
>
> schema_test = StructType([
>     StructField("cluster", LongType()),
>     StructField("name", StringType())
> ])
>
>
> @pandas_udf(schema_test, PandasUDFType.GROUPED_MAP)
> def test_foo(pd_df):
>     print('\n\nSid is problematic\n\n')
>     pd_df['cluster'] = 1
>     return pd_df[['name', 'cluster']]
>
>
>
>
>
> department1 = Row(id='123456', name='Computer Science')
> department2 = Row(id='789012', name='Mechanical Engineering')
> users_data = spark.createDataFrame([department1, department2])
> res = users_data.groupby('id').apply(test_foo)
>
> res.write.parquet(RES_OUTPUT_PATH, mode='overwrite')
>
>
>
>
>
> the errors I’m getting are:
> [...]
>
-- 
Twitter: https://twitter.com/holdenkarau
Books (Learning Spark, High Performance Spark, etc.):
https://amzn.to/2MaRAG9
YouTube Live Streams: https://www.youtube.com/user/holdenkarau