You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@flink.apache.org by "David Maddison (Jira)" <ji...@apache.org> on 2022/07/20 10:15:00 UTC

[jira] [Comment Edited] (FLINK-23886) An exception is thrown out when recover job timers from checkpoint file

    [ https://issues.apache.org/jira/browse/FLINK-23886?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17568967#comment-17568967 ] 

David Maddison edited comment on FLINK-23886 at 7/20/22 10:14 AM:
------------------------------------------------------------------

I realise this doesn't move the conversation any further toward a solution, but just to add another data point to show that this happens in different environments.  Job has been running for 37 days, again Flink 13.5, commit 0ff28a7 @ 2021-12-14T23:26:04+01:00
{code:java}
org.apache.flink.util.FlinkRuntimeException: Error while deserializing the element.
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:388)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:145)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:58)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:284)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:271)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.isElementPriorityLessThen(HeapPriorityQueue.java:161)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.siftUp(HeapPriorityQueue.java:118)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.addInternal(HeapPriorityQueue.java:82)
	at org.apache.flink.runtime.state.heap.AbstractHeapPriorityQueue.add(AbstractHeapPriorityQueue.java:71)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue.<init>(KeyGroupPartitionedPriorityQueue.java:93)
	at org.apache.flink.contrib.streaming.state.RocksDBPriorityQueueSetFactory.create(RocksDBPriorityQueueSetFactory.java:113)
	at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend.create(RocksDBKeyedStateBackend.java:476)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.createTimerPriorityQueue(InternalTimeServiceManagerImpl.java:174)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.registerOrGetTimerService(InternalTimeServiceManagerImpl.java:158)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.getInternalTimerService(InternalTimeServiceManagerImpl.java:136)
	at org.apache.flink.streaming.api.operators.AbstractStreamOperator.getInternalTimerService(AbstractStreamOperator.java:620)
	at org.apache.flink.streaming.runtime.operators.windowing.WindowOperator.open(WindowOperator.java:225)
	at org.apache.flink.streaming.runtime.tasks.OperatorChain.initializeStateAndOpenOperators(OperatorChain.java:442)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:585)
	at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.executeRestore(StreamTask.java:565)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.runWithCleanUpOnFail(StreamTask.java:650)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:540)
	at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:759)
	at org.apache.flink.runtime.taskmanager.Task.run(Task.java:566)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.EOFException
	at org.apache.flink.core.memory.DataInputDeserializer.readUnsignedByte(DataInputDeserializer.java:329)
	at org.apache.flink.types.StringValue.readString(StringValue.java:786)
	at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:73)
	at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:31)
	at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:161)
	at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:43)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:386)
	... 25 more
 {code}


was (Author: maddisondavid):
I realise this doesn't move the conversation any further to a solution, but just to add another data point to show that this happens in different environments.  Job has been running for 37 days, again Flink 13.5, commit 0ff28a7 @ 2021-12-14T23:26:04+01:00
{code:java}
org.apache.flink.util.FlinkRuntimeException: Error while deserializing the element.
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:388)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:145)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:58)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:284)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:271)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.isElementPriorityLessThen(HeapPriorityQueue.java:161)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.siftUp(HeapPriorityQueue.java:118)
	at org.apache.flink.runtime.state.heap.HeapPriorityQueue.addInternal(HeapPriorityQueue.java:82)
	at org.apache.flink.runtime.state.heap.AbstractHeapPriorityQueue.add(AbstractHeapPriorityQueue.java:71)
	at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue.<init>(KeyGroupPartitionedPriorityQueue.java:93)
	at org.apache.flink.contrib.streaming.state.RocksDBPriorityQueueSetFactory.create(RocksDBPriorityQueueSetFactory.java:113)
	at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend.create(RocksDBKeyedStateBackend.java:476)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.createTimerPriorityQueue(InternalTimeServiceManagerImpl.java:174)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.registerOrGetTimerService(InternalTimeServiceManagerImpl.java:158)
	at org.apache.flink.streaming.api.operators.InternalTimeServiceManagerImpl.getInternalTimerService(InternalTimeServiceManagerImpl.java:136)
	at org.apache.flink.streaming.api.operators.AbstractStreamOperator.getInternalTimerService(AbstractStreamOperator.java:620)
	at org.apache.flink.streaming.runtime.operators.windowing.WindowOperator.open(WindowOperator.java:225)
	at org.apache.flink.streaming.runtime.tasks.OperatorChain.initializeStateAndOpenOperators(OperatorChain.java:442)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:585)
	at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.executeRestore(StreamTask.java:565)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.runWithCleanUpOnFail(StreamTask.java:650)
	at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:540)
	at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:759)
	at org.apache.flink.runtime.taskmanager.Task.run(Task.java:566)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.EOFException
	at org.apache.flink.core.memory.DataInputDeserializer.readUnsignedByte(DataInputDeserializer.java:329)
	at org.apache.flink.types.StringValue.readString(StringValue.java:786)
	at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:73)
	at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:31)
	at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:161)
	at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:43)
	at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:386)
	... 25 more
 {code}

> An exception is thrown out when recover job timers from checkpoint file
> -----------------------------------------------------------------------
>
>                 Key: FLINK-23886
>                 URL: https://issues.apache.org/jira/browse/FLINK-23886
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / State Backends
>    Affects Versions: 1.10.0, 1.11.3, 1.13.2
>            Reporter: Jing Zhang
>            Priority: Major
>         Attachments: image-2021-08-25-16-38-04-023.png, image-2021-08-25-16-38-12-308.png, image-2021-08-25-17-06-29-806.png, image-2021-08-25-17-07-38-327.png
>
>
> A user report the bug in the [mailist. |http://mail-archives.apache.org/mod_mbox/flink-user/202108.mbox/%3CCAKmSf43J14nkjMgjUy4dh5qN2VbJtw4TFh4pmmuyVCvfhgf5tQ@mail.gmail.com%3E]I paste the content here.
> Setup Specifics:
>  Version: 1.6.2
>  RocksDB Map State
>  Timers stored in rocksdb
>   
>  When we have this job running for long periods of time like > 30 days, if for some reason the job restarts, we encounter "Error while deserializing the element". Is this a known issue fixed in later versions? I see some changes to code for FLINK-10175, but we don't use any queryable state 
>   
>  Below is the stack trace
>   
>  org.apache.flink.util.FlinkRuntimeException: Error while deserializing the element.
> at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:389)
> at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:146)
> at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.peek(RocksDBCachingPriorityQueueSet.java:56)
> at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:274)
> at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue$InternalPriorityQueueComparator.comparePriority(KeyGroupPartitionedPriorityQueue.java:261)
> at org.apache.flink.runtime.state.heap.HeapPriorityQueue.isElementPriorityLessThen(HeapPriorityQueue.java:164)
> at org.apache.flink.runtime.state.heap.HeapPriorityQueue.siftUp(HeapPriorityQueue.java:121)
> at org.apache.flink.runtime.state.heap.HeapPriorityQueue.addInternal(HeapPriorityQueue.java:85)
> at org.apache.flink.runtime.state.heap.AbstractHeapPriorityQueue.add(AbstractHeapPriorityQueue.java:73)
> at org.apache.flink.runtime.state.heap.KeyGroupPartitionedPriorityQueue.<init>(KeyGroupPartitionedPriorityQueue.java:89)
> at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend$RocksDBPriorityQueueSetFactory.create(RocksDBKeyedStateBackend.java:2792)
> at org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend.create(RocksDBKeyedStateBackend.java:450)
> at org.apache.flink.streaming.api.operators.InternalTimeServiceManager.createTimerPriorityQueue(InternalTimeServiceManager.java:121)
> at org.apache.flink.streaming.api.operators.InternalTimeServiceManager.registerOrGetTimerService(InternalTimeServiceManager.java:106)
> at org.apache.flink.streaming.api.operators.InternalTimeServiceManager.getInternalTimerService(InternalTimeServiceManager.java:87)
> at org.apache.flink.streaming.api.operators.AbstractStreamOperator.getInternalTimerService(AbstractStreamOperator.java:764)
> at org.apache.flink.streaming.api.operators.KeyedProcessOperator.open(KeyedProcessOperator.java:61)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.openAllOperators(StreamTask.java:424)
> at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:290)
> at org.apache.flink.runtime.taskmanager.Task.run(Task.java:711)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.io.EOFException
> at java.io.DataInputStream.readUnsignedByte(DataInputStream.java:290)
> at org.apache.flink.types.StringValue.readString(StringValue.java:769)
> at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:69)
> at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize(StringSerializer.java:28)
> at org.apache.flink.api.java.typeutils.runtime.RowSerializer.deserialize(RowSerializer.java:179)
> at org.apache.flink.api.java.typeutils.runtime.RowSerializer.deserialize(RowSerializer.java:46)
> at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:168)
> at org.apache.flink.streaming.api.operators.TimerSerializer.deserialize(TimerSerializer.java:45)
> at org.apache.flink.contrib.streaming.state.RocksDBCachingPriorityQueueSet.deserializeElement(RocksDBCachingPriorityQueueSet.java:387)
> ... 20 more



--
This message was sent by Atlassian Jira
(v8.20.10#820010)