Posted to user-zh@flink.apache.org by 陈卓宇 <25...@qq.com.INVALID> on 2022/06/24 03:59:42 UTC

Production Flink job suddenly hitting consecutive checkpoint failures

Flink version: 1.13.1
HDFS version: 3+
Exception log:

2022-06-24 10:58:19,839 INFO  org.apache.flink.runtime.checkpoint.CheckpointCoordinator    [] - Decline checkpoint 1101 by task b3d88f9ef72bda003056856c4422742d of job 6bd7dc46451f01e008762c9b556cb08f at zhaohy4-test-taskmanager-1-1 @ 10.42.5.55 (dataPort=40558).
org.apache.flink.util.SerializedThrowable: Asynchronous task checkpoint failed.
    at org.apache.flink.streaming.runtime.tasks.AsyncCheckpointRunnable.handleExecutionException(AsyncCheckpointRunnable.java:279) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.streaming.runtime.tasks.AsyncCheckpointRunnable.run(AsyncCheckpointRunnable.java:175) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_202]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_202]
    at java.lang.Thread.run(Thread.java:748) [?:1.8.0_202]
Caused by: org.apache.flink.util.SerializedThrowable: Could not materialize checkpoint 1101 for operator IntervalJoin(joinType=[InnerJoin], windowBounds=[isRowTime=true, leftLowerBound=-1296000000, leftUpperBound=1296000000, leftTimeIndex=4, rightTimeIndex=4], where=[((hire_contract_id = id) AND (last_modify_time >= (last_modify_time0 - 1296000000:INTERVAL DAY)) AND (last_modify_time <= (last_modify_time0 + 1296000000:INTERVAL DAY)))], select=[hire_contract_id, hire_status_code, sign_date, confirm_date, last_modify_time, proctime, id, hire_contract_code, ziroom_version_id, is_del, last_modify_time0]) -> Calc(select=[hire_contract_id, hire_status_code, sign_date, confirm_date, last_modify_time, proctime, hire_contract_code, ziroom_version_id, is_del AS is_del0, last_modify_time0]) (1/1)#3.
    at org.apache.flink.streaming.runtime.tasks.AsyncCheckpointRunnable.handleExecutionException(AsyncCheckpointRunnable.java:257) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    ... 4 more
Caused by: org.apache.flink.util.SerializedThrowable: java.io.IOException: Could not flush to file and close the file system output stream to hdfs://zrHdfsHa/user/flink/checkpointsdata/6bd7dc46451f01e008762c9b556cb08f/shared/5a5118ba-427f-4234-8e36-ec8d24418fe4 in order to obtain the stream state handle
    at java.util.concurrent.FutureTask.report(FutureTask.java:122) ~[?:1.8.0_202]
    at java.util.concurrent.FutureTask.get(FutureTask.java:192) ~[?:1.8.0_202]
    at org.apache.flink.runtime.concurrent.FutureUtils.runIfNotDoneAndGet(FutureUtils.java:636) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.streaming.api.operators.OperatorSnapshotFinalizer.<init>(OperatorSnapshotFinalizer.java:54) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.streaming.runtime.tasks.AsyncCheckpointRunnable.run(AsyncCheckpointRunnable.java:128) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    ... 3 more
Caused by: org.apache.flink.util.SerializedThrowable: Could not flush to file and close the file system output stream to hdfs://zrHdfsHa/user/flink/checkpointsdata/6bd7dc46451f01e008762c9b556cb08f/shared/5a5118ba-427f-4234-8e36-ec8d24418fe4 in order to obtain the stream state handle
    at org.apache.flink.runtime.state.filesystem.FsCheckpointStreamFactory$FsCheckpointStateOutputStream.closeAndGetHandle(FsCheckpointStreamFactory.java:373) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.contrib.streaming.state.RocksDBStateUploader.uploadLocalFileToCheckpointFs(RocksDBStateUploader.java:143) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.contrib.streaming.state.RocksDBStateUploader.lambda$createUploadFutures$0(RocksDBStateUploader.java:101) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.util.function.CheckedSupplier.lambda$unchecked$0(CheckedSupplier.java:32) ~[flink-core-1.13.1.jar:1.13.1]
    at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1590) ~[?:1.8.0_202]
    ... 3 more
Caused by: org.apache.flink.util.SerializedThrowable: Unable to close file because the last blockBP-1965840142-10.216.138.23-1585685654447:blk_2926076096_1852445656 does not have enough number of replicas.
    at org.apache.hadoop.hdfs.DFSOutputStream.completeFile(DFSOutputStream.java:966) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.hadoop.hdfs.DFSOutputStream.completeFile(DFSOutputStream.java:909) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.hadoop.hdfs.DFSOutputStream.closeImpl(DFSOutputStream.java:892) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.hadoop.hdfs.DFSOutputStream.close(DFSOutputStream.java:847) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.close(FSDataOutputStream.java:72) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.hadoop.fs.FSDataOutputStream.close(FSDataOutputStream.java:101) ~[flink-shaded-hadoop-3-uber-3.1.1.7.2.1.0-327-9.0.jar:3.1.1.7.2.1.0-327-9.0]
    at org.apache.flink.runtime.fs.hdfs.HadoopDataOutputStream.close(HadoopDataOutputStream.java:52) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.core.fs.ClosingFSDataOutputStream.close(ClosingFSDataOutputStream.java:64) ~[flink-core-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.state.filesystem.FsCheckpointStreamFactory$FsCheckpointStateOutputStream.closeAndGetHandle(FsCheckpointStreamFactory.java:354) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.contrib.streaming.state.RocksDBStateUploader.uploadLocalFileToCheckpointFs(RocksDBStateUploader.java:143) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.contrib.streaming.state.RocksDBStateUploader.lambda$createUploadFutures$0(RocksDBStateUploader.java:101) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.util.function.CheckedSupplier.lambda$unchecked$0(CheckedSupplier.java:32) ~[flink-core-1.13.1.jar:1.13.1]
    at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1590) ~[?:1.8.0_202]
    ... 3 more
2022-06-24 10:58:19,844 INFO  org.apache.flink.runtime.jobmaster.JobMaster                 [] - Trying to recover from a global failure.
org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable failure threshold.
    at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:98) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleTaskLevelCheckpointException(CheckpointFailureManager.java:84) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:1931) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveDeclineMessage(CheckpointCoordinator.java:991) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$declineCheckpoint$2(ExecutionGraphHandler.java:103) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) ~[flink-dist_2.11-1.13.1.jar:1.13.1]
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_202]
    at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_202]
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) ~[?:1.8.0_202]
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) ~[?:1.8.0_202]
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_202]
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_202]
    at java.lang.Thread.run(Thread.java:748) ~[?:1.8.0_202]
2022-06-24 10:58:19,846 INFO  org.apache.flink.runtime.executiongraph.ExecutionGraph       [] - Job zhaohy4-test (6bd7dc46451f01e008762c9b556cb08f) switched from state RUNNING to RESTARTING.

Re: Production Flink job suddenly hitting consecutive checkpoint failures

Posted by Lijie Wang <wa...@gmail.com>.
-> Caused by: org.apache.flink.util.SerializedThrowable: Unable to close file because the last blockBP-1965840142-10.216.138.23-1585685654447:blk_2926076096_1852445656 does not have enough number of replicas.

Judging from this error, it is a problem writing to HDFS; I'd suggest checking whether HDFS is healthy.
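One way to confirm it is HDFS rather than Flink is to reproduce the failing create/write/close path with a standalone probe outside the job. Below is a rough, untested sketch in plain Java; the class name and the probe path under your checkpoint directory are made up for illustration, and it assumes the hadoop-client dependency plus the same core-site.xml/hdfs-site.xml your TaskManagers use are on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsCloseProbe {
    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml / hdfs-site.xml (the zrHdfsHa nameservice) from the classpath.
        Configuration conf = new Configuration();
        // Hypothetical scratch path next to the real checkpoint data.
        Path probe = new Path("hdfs://zrHdfsHa/user/flink/checkpointsdata/_close_probe");

        FileSystem fs = FileSystem.get(probe.toUri(), conf);
        try (FSDataOutputStream out = fs.create(probe, true /* overwrite */)) {
            out.write(new byte[4096]);
            out.hflush();
        } // close() runs DFSOutputStream.completeFile(), which is where
          // "does not have enough number of replicas" is thrown.
        System.out.println("close() succeeded; the NameNode finalized the last block.");
        fs.delete(probe, false);
    }
}

If the probe hits the same exception, the problem is on the HDFS side (DataNodes too slow or too few to confirm the last block's replicas before the client gives up). A client-side setting that is often raised for this particular message is dfs.client.block.write.locateFollowingBlock.retries in hdfs-site.xml, which gives completeFile() more attempts before failing. On the Flink side you can also keep a transient hiccup from restarting the whole job by allowing a few failed checkpoints via CheckpointConfig#setTolerableCheckpointFailureNumber, since the restart in your log was triggered by "Exceeded checkpoint tolerable failure threshold".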

Best,
Lijie

陈卓宇 <25...@qq.com.invalid> wrote on Fri, Jun 24, 2022, 12:00:

> Flink version: 1.13.1
> HDFS version: 3+
> Exception log: [...]