Posted to issues@spark.apache.org by "Huseyin Elci (Jira)" <ji...@apache.org> on 2021/02/04 03:38:00 UTC

[jira] [Created] (SPARK-34351) Running into "Py4JJavaError" when counting a text file or list RDD using PySpark in a Jupyter notebook

Huseyin Elci created SPARK-34351:
------------------------------------

             Summary: Running into "Py4JJavaError" when counting a text file or list RDD using PySpark in a Jupyter notebook
                 Key: SPARK-34351
                 URL: https://issues.apache.org/jira/browse/SPARK-34351
             Project: Spark
          Issue Type: Bug
          Components: PySpark
    Affects Versions: 2.3.1
         Environment: PS> python --version
*Python 3.6.8*


PS> jupyter --version
*jupyter core : 4.7.0*
*jupyter-notebook : 6.2.0*
qtconsole : 5.0.2
ipython : 7.16.1
ipykernel : 5.4.3
jupyter client : 6.1.11
jupyter lab : not installed
nbconvert : 6.0.7
ipywidgets : 7.6.3
nbformat : 5.1.2
traitlets : 4.3.3


PS > java -version
*java version "1.8.0_271"*
Java(TM) SE Runtime Environment (build 1.8.0_271-b09)
Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)

 

Spark version

*spark-2.3.1-bin-hadoop2.7*
            Reporter: Huseyin Elci


I run into the following error with both of the code snippets below.
 Any help resolving this error is greatly appreciated.
 *My Code 1:*
{code:python}
import findspark
findspark.init("C:\Spark")   # point findspark at the local Spark installation

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Local SparkSession with 4 worker threads
spark = SparkSession.builder\
        .master("local[4]")\
        .appName("WordCount_RDD")\
        .getOrCreate()
sc = spark.sparkContext

# Read the text file as an RDD and count its lines (count() triggers the failing job)
data = "D:\\05 Spark\\data\\MyArticle.txt"
story_rdd = sc.textFile(data)
story_rdd.count()

{code}
*My Code 2:* 
{code:python}
import findspark
findspark.init("C:\Spark")
from pyspark import SparkContext

# Bare SparkContext (defaults to local mode)
sc = SparkContext()

mylist = [1,2,2,3,5,48,98,62,14,55]
mylist_rdd = sc.parallelize(mylist)
mylist_rdd.map(lambda x: x*x)            # lazy transformation, nothing runs yet
mylist_rdd.map(lambda x: x*x).collect()  # collect() triggers the job and fails
{code}
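
For reference, a quick check of which interpreter and Spark home the session actually picks up (a minimal diagnostic sketch; PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are the standard Spark environment variables and may simply be unset on this machine):
{code:python}
import os
import sys

# Interpreter running the Jupyter notebook (the driver process)
print("notebook python       :", sys.executable)

# Interpreters Spark would use for the driver and the workers, if configured
print("PYSPARK_PYTHON        :", os.environ.get("PYSPARK_PYTHON"))
print("PYSPARK_DRIVER_PYTHON :", os.environ.get("PYSPARK_DRIVER_PYTHON"))

# Spark installation that findspark should be pointing at
print("SPARK_HOME            :", os.environ.get("SPARK_HOME"))
{code}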
*ERROR:*

I get the same error with both code snippets.
{code:python}
 ---------------------------------------------------------------------------
 Py4JJavaError Traceback (most recent call last)
 <ipython-input-9-1af9abd2340f> in <module>
 ----> 1 story_rdd.count()

C:\Spark\python\pyspark\rdd.py in count(self)
 1071 3
 1072 """
 -> 1073 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
 1074 
 1075 def stats(self):

C:\Spark\python\pyspark\rdd.py in sum(self)
 1062 6.0
 1063 """
 -> 1064 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
 1065 
 1066 def count(self):

C:\Spark\python\pyspark\rdd.py in fold(self, zeroValue, op)
 933 # zeroValue provided to each partition is unique from the one provided
 934 # to the final reduce call
 --> 935 vals = self.mapPartitions(func).collect()
 936 return reduce(op, vals, zeroValue)
 937

C:\Spark\python\pyspark\rdd.py in collect(self)
 832 """
 833 with SCCallSiteSync(self.context) as css:
 --> 834 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
 835 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
 836

C:\Spark\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
 1255 answer = self.gateway_client.send_command(command)
 1256 return_value = get_return_value(
 -> 1257 answer, self.gateway_client, self.target_id, self.name)
 1258 
 1259 for temp_arg in temp_args:

C:\Spark\python\pyspark\sql\utils.py in deco(*a, **kw)
 61 def deco(*a, **kw):
 62 try:
 ---> 63 return f(*a, **kw)
 64 except py4j.protocol.Py4JJavaError as e:
 65 s = e.java_exception.toString()

C:\Spark\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
 326 raise Py4JJavaError(
 327 "An error occurred while calling {0}{1}{2}.\n".
 --> 328 format(target_id, ".", name), value)
 329 else:
 330 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
 : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
 at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:148)
 at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:76)
 at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
 at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:86)
 at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:64)
 at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
 at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
 at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
 at org.apache.spark.scheduler.Task.run(Task.scala:109)
 at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
 at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
 at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
 at java.lang.Thread.run(Thread.java:748)
 Caused by: java.net.SocketTimeoutException: Accept timed out
 at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
 at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
 at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
 at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
 at java.net.ServerSocket.implAccept(ServerSocket.java:545)
 at java.net.ServerSocket.accept(ServerSocket.java:513)
 at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:142)
 ... 12 more

Driver stacktrace:
 at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
 at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
 at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
 at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
 at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
 at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
 at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
 at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
 at scala.Option.foreach(Option.scala:257)
 at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
 at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
 at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
 at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
 at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
 at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
 at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
 at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
 at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
 at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
 at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
 at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
 at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
 at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
 at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
 at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
 at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
 at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
 at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:498)
 at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
 at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
 at py4j.Gateway.invoke(Gateway.java:282)
 at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
 at py4j.commands.CallCommand.execute(CallCommand.java:79)
 at py4j.GatewayConnection.run(GatewayConnection.java:238)
 at java.lang.Thread.run(Thread.java:748)
 Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
 at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:148)
 at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:76)
 at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
 at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:86)
 at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:64)
 at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
 at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
 at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
 at org.apache.spark.scheduler.Task.run(Task.scala:109)
 at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
 at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
 at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
 ... 1 more
 Caused by: java.net.SocketTimeoutException: Accept timed out
 at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
 at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
 at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
 at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
 at java.net.ServerSocket.implAccept(ServerSocket.java:545)
 at java.net.ServerSocket.accept(ServerSocket.java:513)
 at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:142)
 ... 12 more

{code}
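
From the trace, the JVM times out waiting for the Python worker to connect back ("Accept timed out"). A workaround commonly suggested for this error on Windows is to pin both the driver and the workers to the notebook's own interpreter before creating the context; the snippet below is only a sketch of that suggestion and has not been confirmed to fix this setup:
{code:python}
import os
import sys
import findspark

# Use the same interpreter for the driver and the Python workers
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

findspark.init("C:\\Spark")

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[4]") \
        .appName("WordCount_RDD") \
        .getOrCreate()
{code}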



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org