Posted to user@spark.apache.org by Dirceu Semighini Filho <di...@gmail.com> on 2017/06/13 18:02:18 UTC

Read Local File

Hi all,
I'm trying to read a file from the local filesystem. I have 4 workstations, 1 master and 3 slaves, running Ambari and YARN, with Spark version *2.1.1.2.6.1.0-129*.

The code that I'm trying to run is quite simple:

spark.sqlContext.read.text("file:///pathToFile").count

I've copied the file to all 4 workstations, and every time I try to run this I get the following exception:
17/06/13 17:57:37 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 12, ip, executor 1): java.io.FileNotFoundException: File file:pathToFile does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:175)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:126)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

17/06/13 17:57:37 ERROR TaskSetManager: Task 0 in stage 6.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage 6.0 (TID 15, ip, executor 1): java.io.FileNotFoundException: File file:file:pathToFile does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:175)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:126)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
  at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:275)
  at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
  at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
  at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
  at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2420)
  at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2419)
  at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2801)
  at org.apache.spark.sql.Dataset.count(Dataset.scala:2419)
  ... 48 elided
Caused by: java.io.FileNotFoundException: File file:file:pathToFile does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
  at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:175)
  at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
  at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
  at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
  at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
  at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
  at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:126)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
  at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
  at org.apache.spark.scheduler.Task.run(Task.scala:99)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
  at java.lang.Thread.run(Thread.java:745)

Can somebody help me fix this?

Kind Regards,
Dirceu

Re: Read Local File

Posted by Dirceu Semighini Filho <di...@gmail.com>.
Hello Satish,
Thanks for your answer.
*I guess you have already made sure that the paths for your file are
exactly the same on each of your nodes*
Yes.
* I'd also check the perms on your path*
The files all have the same permissions.
*Believe the sample code you pasted is only for testing - and you are
already aware that a distributed count on a local file has no benefits.*
I did this to force the file to actually be read.
spark.sqlContext.read.text("file:///pathToFile").count

* Typing the path explicitly resolved*
I'll try this; I'm not sure if I've tested the varargs path option.
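
From the API docs, DataFrameReader.text in 2.x takes varargs paths, so I assume it would look something like this (the file names below are just placeholders):

// Hypothetical multi-path read; file1.txt and file2.txt stand in for
// the real files, which would still need to exist on every node.
val df = spark.read.text(
  "file:///pathTo/file1.txt",
  "file:///pathTo/file2.txt")
df.count
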
*Alternately - if the file size is small, you could do spark-submit with a
--files option which will ship the file to every executor and is available
for all executors. *
Unfortunately I can't do this, because it's a continuous reading process.

Thanks.

Re: Read Local File

Posted by satish lalam <sa...@gmail.com>.
I guess you have already made sure that the paths for your file are exactly
the same on each of your nodes. I'd also check the perms on your path.
Believe the sample code you pasted is only for testing - and you are
already aware that a distributed count on a local file has no benefits.
Once I ran into a similar issue while copy-pasting file paths, probably due
to encoding issues in some text editors. I'd copied a hidden char at the
end of the path from the source file, which made my file lookup fail even
though the code looked perfectly alright. Typing the path explicitly resolved
it. But this is a corner case.
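
If it helps, one way to check the path and the perms together is to probe the file from a few tasks, so the check runs on the executors themselves. This is only a sketch - the path is a placeholder and the partition count is arbitrary:

import java.io.File

// Probe the local file from several tasks; with enough partitions the
// tasks should land on different executors.
val path = "/pathToFile"
spark.sparkContext.parallelize(1 to 12, 12).map { _ =>
  val host = java.net.InetAddress.getLocalHost.getHostName
  val f = new File(path)
  s"$host exists=${f.exists} readable=${f.canRead}"
}.collect().distinct.foreach(println)

// Dump the code points of the pasted path to expose invisible characters.
path.foreach(c => println(f"'$c' U+${c.toInt}%04X"))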

Alternately - if the file size is small, you could do spark-submit with a
--files option which will ship the file to every executor and is available
for all executors.
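
Roughly like this - the app and file names are made up, and note that inside the job the shipped copy is resolved per executor with SparkFiles rather than opened with spark.read:

// Submitted with something like:
//   spark-submit --master yarn --files /local/path/myfile.txt my-app.jar
import org.apache.spark.SparkFiles
import scala.io.Source

// Each executor localizes its own copy of myfile.txt; SparkFiles.get
// returns the executor-local path when called inside a task.
val counts = spark.sparkContext.parallelize(Seq(1), 1).map { _ =>
  val src = Source.fromFile(SparkFiles.get("myfile.txt"))
  try src.getLines().size finally src.close()
}.collect()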




On Tue, Jun 13, 2017 at 11:02 AM, Dirceu Semighini Filho <
dirceu.semighini@gmail.com> wrote:

> Hi all,
> I'm trying to read a File from local filesystem, I'd 4 workstations 1
> Master and 3 slaves, running with Ambari and Yarn with Spark version*
> 2.1.1.2.6.1.0-129*
>
> The code that I'm trying to run is quite simple
>
> spark.sqlContext.read.text("file:///pathToFile").count
>
> I've copied the file in all 4 workstations and every time that I try to
> run this I got the following exception:
> 17/06/13 17:57:37 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 12,
> ip, executor 1): java.io.FileNotFoundException: File file:pathToFile does
> not exist
> It is possible the underlying files have been updated. You can explicitly
> invalidate the cache in Spark by running 'REFRESH TABLE tableName' command
> in SQL or by recreating the Dataset/DataFrame involved.
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.
> nextIterator(FileScanRDD.scala:175)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(
> FileScanRDD.scala:109)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.
> hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$
> anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(
> BypassMergeSortShuffleWriter.java:126)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:96)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:53)
> at org.apache.spark.scheduler.Task.run(Task.scala:99)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1142)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
>
> 17/06/13 17:57:37 ERROR TaskSetManager: Task 0 in stage 6.0 failed 4
> times; aborting job
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage
> 6.0 (TID 15, ip, executor 1): java.io.FileNotFoundException: File
> file:file:pathToFile does not exist
> It is possible the underlying files have been updated. You can explicitly
> invalidate the cache in Spark by running 'REFRESH TABLE tableName' command
> in SQL or by recreating the Dataset/DataFrame involved.
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.
> nextIterator(FileScanRDD.scala:175)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(
> FileScanRDD.scala:109)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
> at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.processNext(Unknown Source)
> at org.apache.spark.sql.execution.BufferedRowIterator.
> hasNext(BufferedRowIterator.java:43)
> at org.apache.spark.sql.execution.WholeStageCodegenExec$$
> anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
> at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(
> BypassMergeSortShuffleWriter.java:126)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:96)
> at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:53)
> at org.apache.spark.scheduler.Task.run(Task.scala:99)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1142)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>   at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$
> scheduler$DAGScheduler$$failJobAndIndependentStages(
> DAGScheduler.scala:1435)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(
> DAGScheduler.scala:1423)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(
> DAGScheduler.scala:1422)
>   at scala.collection.mutable.ResizableArray$class.foreach(
> ResizableArray.scala:59)
>   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.abortStage(
> DAGScheduler.scala:1422)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$
> handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
>   at org.apache.spark.scheduler.DAGScheduler$$anonfun$
> handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
>   at scala.Option.foreach(Option.scala:257)
>   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(
> DAGScheduler.scala:802)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.
> doOnReceive(DAGScheduler.scala:1650)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.
> onReceive(DAGScheduler.scala:1605)
>   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.
> onReceive(DAGScheduler.scala:1594)
>   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>   at org.apache.spark.scheduler.DAGScheduler.runJob(
> DAGScheduler.scala:628)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
>   at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
>   at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(
> RDDOperationScope.scala:151)
>   at org.apache.spark.rdd.RDDOperationScope$.withScope(
> RDDOperationScope.scala:112)
>   at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
>   at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
>   at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.
> scala:275)
>   at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$
> Dataset$$execute$1$1.apply(Dataset.scala:2386)
>   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(
> SQLExecution.scala:57)
>   at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$
> execute$1(Dataset.scala:2385)
>   at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$
> collect(Dataset.scala:2392)
>   at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.
> scala:2420)
>   at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.
> scala:2419)
>   at org.apache.spark.sql.Dataset.withCallback(Dataset.scala:2801)
>   at org.apache.spark.sql.Dataset.count(Dataset.scala:2419)
>   ... 48 elided
> Caused by: java.io.FileNotFoundException: File file:file:pathToFile does
> not exist
> It is possible the underlying files have been updated. You can explicitly
> invalidate the cache in Spark by running 'REFRESH TABLE tableName' command
> in SQL or by recreating the Dataset/DataFrame involved.
>   at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.
> nextIterator(FileScanRDD.scala:175)
>   at org.apache.spark.sql.execution.datasources.
> FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
>   at org.apache.spark.sql.catalyst.expressions.GeneratedClass$
> GeneratedIterator.processNext(Unknown Source)
>   at org.apache.spark.sql.execution.BufferedRowIterator.
> hasNext(BufferedRowIterator.java:43)
>   at org.apache.spark.sql.execution.WholeStageCodegenExec$$
> anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
>   at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(
> BypassMergeSortShuffleWriter.java:126)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:96)
>   at org.apache.spark.scheduler.ShuffleMapTask.runTask(
> ShuffleMapTask.scala:53)
>   at org.apache.spark.scheduler.Task.run(Task.scala:99)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(
> ThreadPoolExecutor.java:1142)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(
> ThreadPoolExecutor.java:617)
>   at java.lang.Thread.run(Thread.java:745)
>
> Can somebody help me on fixing this?
>
> Kind Regards,
> Dirceu
>