Posted to user@spark.apache.org by Msr Msr <ms...@gmail.com> on 2016/02/18 01:12:32 UTC
pyspark take() error while count() and collect() work fine
Hi,
Please help with the issue below.
I am reading a large file and trying to display its first few lines. Only
take(1) succeeds; take(2) or more fails with the error message below, so
the problem seems specific to the take() function. Both count() and
collect() work fine.
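For reference, here is a minimal session showing exactly what works and
what fails (same file as the test file linked below):

    >>> data1 = sc.textFile("testdata.txt")
    >>> data1.count()    # works
    >>> data1.collect()  # works
    >>> data1.take(1)    # works
    >>> data1.take(2)    # fails with the stack trace below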
The input file was created on a Mac, and we are running Spark on a 64-bit
Windows machine. The same Spark job processes the same data file perfectly
on the Mac. I have verified the file's carriage-return and line-feed
characters, and they look fine.
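This is how I checked the line endings, in case I missed something (a
quick sketch; the read size is arbitrary):

    # Print the first bytes of the file so the line terminators are visible:
    # '\n' (LF), '\r\n' (CRLF), or '\r' (CR, classic Mac).
    with open("testdata.txt", "rb") as f:
        head = f.read(256)
    print(repr(head))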
Spark: spark-1.5.2
Python: 2.7
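In case this does turn out to be a record-delimiter issue after all, one
workaround I am considering (untested on this setup; the "\r" delimiter
below is only an assumption for illustration) is to set Hadoop's record
delimiter explicitly instead of relying on textFile():

    # Read the file with an explicit record delimiter via the new Hadoop API;
    # replace "\r" with whatever terminator the file actually uses.
    delim_conf = {"textinputformat.record.delimiter": "\r"}
    lines = sc.newAPIHadoopFile(
        "testdata.txt",
        "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
        "org.apache.hadoop.io.LongWritable",
        "org.apache.hadoop.io.Text",
        conf=delim_conf,
    ).map(lambda kv: kv[1])  # keep the line text, drop the byte offset key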
---
Test file:
<http://apache-spark-user-list.1001560.n3.nabble.com/file/n26254/testdata.txt>
---
Error stack trace:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-73-35940a5bae89> in <module>()
1 data1 = sc.textFile("testdata.txt")
----> 2 data1.take(2)
D:\spark-1.5.2-bin-hadoop2.6\spark-1.5.2-bin-hadoop2.6\python\pyspark\rdd.pyc in take(self, num)
   1297
   1298         p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
--> 1299         res = self.context.runJob(self, takeUpToNumLeft, p)
   1300
   1301         items += res
D:\spark-1.5.2-bin-hadoop2.6\spark-1.5.2-bin-hadoop2.6\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal)
    914         # SparkContext#runJob.
    915         mappedRDD = rdd.mapPartitions(partitionFunc)
--> 916         port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
    917         return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
    918
D:\Program Files (x86)\anaconda27\lib\site-packages\py4j\java_gateway.pyc in __call__(self, *args)
    811         answer = self.gateway_client.send_command(command)
    812         return_value = get_return_value(
--> 813             answer, self.gateway_client, self.target_id, self.name)
    814
    815         for temp_arg in temp_args:
D:\Program Files (x86)\anaconda27\lib\site-packages\py4j\protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
    306                 raise Py4JJavaError(
    307                     "An error occurred while calling {0}{1}{2}.\n".
--> 308                     format(target_id, ".", name), value)
    309             else:
    310                 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 81.0 failed 1 times, most recent failure: Lost task 0.0 in stage 81.0 (TID 108, localhost): java.net.SocketException: Software caused connection abort: recv failed
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
    at java.net.SocketInputStream.read(SocketInputStream.java:170)
    at java.net.SocketInputStream.read(SocketInputStream.java:141)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
    at java.io.DataInputStream.readFully(DataInputStream.java:195)
    at java.io.DataInputStream.readFully(DataInputStream.java:169)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:142)
    at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:129)
    at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:125)
    at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
    at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:393)
    at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:393)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1850)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1850)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1850)
    at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
    at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
    at sun.reflect.GeneratedMethodAccessor47.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
    at py4j.Gateway.invoke(Gateway.java:259)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:207)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.SocketException: Software caused connection abort: recv failed
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
    at java.net.SocketInputStream.read(SocketInputStream.java:170)
    at java.net.SocketInputStream.read(SocketInputStream.java:141)
    at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
    at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
    at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
    at java.io.DataInputStream.readFully(DataInputStream.java:195)
    at java.io.DataInputStream.readFully(DataInputStream.java:169)
    at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:142)
    at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:129)
    at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:125)
    at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
    at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:393)
    at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:393)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1850)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1850)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:88)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    ... 1 more